smol-training-playbook / app /scripts /export-txt.mjs
tfrere's picture
tfrere HF Staff
feat: cherry-pick improvements from PR#13
fe5248a
#!/usr/bin/env node
/**
* Export article to TXT format for book publishing
*
* This script exports the article to a simple text format with custom tags:
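* - <f> NAME | ANCHOR | DESCRIPTION </f> for figures/images (fields joined by " | "; ANCHOR and DESCRIPTION are omitted when empty)
* - <t> NAME | ANCHOR | DESCRIPTION </t> for tables (followed by the table rows, one per line, cells joined by " | ")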
* - <c> CODE | DESCRIPTION </c> for code blocks
* - <ic> CODE </ic> for inline code
* - <il> FORMULA </il> for inline LaTeX
* - <l> katex-number </l> for LaTeX display formulas (references exported PNGs)
* - <b> TEXT </b> for bold
* - <i> TEXT </i> for italic
* - <a href="URL"> TEXT </a> for links
* - <ref> TEXT </ref> for literature/citation references
* - <n> <b>TITLE</b> | CONTENT </n> for note boxes (either the title or the content may be absent)
*
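* Usage:
* node scripts/export-txt.mjs [--filename=NAME] [--wrap-code=BOOL] [--code-width=N]
* npm run export:txt
*
* Output: dist/<name>.txt, also copied to public/<name>.txt. Without --filename,
* <name> is a slug of the page title (falling back to "article").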
*/
import { spawn } from 'node:child_process';
import { setTimeout as delay } from 'node:timers/promises';
import { chromium } from 'playwright';
import { resolve } from 'node:path';
import { promises as fs } from 'node:fs';
import process from 'node:process';
/**
 * Run a command to completion, inheriting this process's stdio by default.
 *
 * @param {string} command - Executable to spawn (no shell unless `options` overrides it).
 * @param {string[]} [args] - Arguments passed to the executable.
 * @param {object} [options] - Extra `spawn` options; they override the defaults.
 * @returns {Promise<undefined>} Resolves once the child exits with code 0.
 * @throws {Error} Rejects when the child cannot be spawned, exits with a
 *   non-zero code, or is terminated by a signal.
 */
async function run(command, args = [], options = {}) {
  return new Promise((resolvePromise, reject) => {
    const child = spawn(command, args, { stdio: 'inherit', shell: false, ...options });
    child.on('error', reject);
    child.on('exit', (code, signal) => {
      if (code === 0) {
        resolvePromise(undefined);
        return;
      }
      // `code` is null when the child was terminated by a signal; report the
      // signal instead of the misleading "exited with code null".
      const reason = code === null && signal
        ? `was terminated by signal ${signal}`
        : `exited with code ${code}`;
      reject(new Error(`${command} ${args.join(' ')} ${reason}`));
    });
  });
}
/**
 * Poll a URL until it answers with a 2xx status or the time budget runs out.
 *
 * @param {string|(() => (string|null|undefined))} urlOrFn - URL to poll, or a
 *   function returning it. A falsy result means "not known yet" and is retried.
 * @param {number} [timeoutMs=60000] - Total time budget in milliseconds.
 * @returns {Promise<void>} Resolves as soon as one request returns `res.ok`.
 * @throws {Error} When no successful response arrives in time; the last
 *   failure (if any) is attached as `cause`.
 */
async function waitForServer(urlOrFn, timeoutMs = 60000) {
  const getUrl = typeof urlOrFn === 'function' ? urlOrFn : () => urlOrFn;
  const deadline = Date.now() + timeoutMs;
  const remainingMs = () => deadline - Date.now();
  let lastError;
  while (remainingMs() > 0) {
    let pauseMs = 500;
    try {
      const url = getUrl();
      if (url) {
        // Bound each request by the remaining budget so a server that accepts
        // the connection but never answers cannot block past the deadline.
        // Clamped to the largest delay timers accept.
        const budget = Math.min(0x7fffffff, Math.max(1, Math.ceil(remainingMs())));
        const res = await fetch(url, { signal: AbortSignal.timeout(budget) });
        if (res.ok) return;
        lastError = new Error(`HTTP ${res.status} from ${url}`);
      } else {
        // URL not known yet: poll a little faster.
        pauseMs = 200;
      }
    } catch (err) {
      // Connection refused / aborted / etc. are expected while the server boots.
      lastError = err;
    }
    await delay(Math.max(0, Math.min(pauseMs, remainingMs())));
  }
  const lastUrl = getUrl();
  throw new Error(`Server did not start in time: ${lastUrl || '(unknown url)'}`, { cause: lastError });
}
/**
 * Parse `--key` and `--key=value` command-line flags.
 *
 * The first two entries of `argv` (runtime and script path) are skipped, as
 * are arguments that do not start with `--`. A flag without `=` maps to
 * `true`. Only the first `=` separates key from value, so values may
 * themselves contain `=`.
 *
 * @param {string[]} argv - Typically `process.argv`.
 * @returns {Object<string, string|true>} Map of flag name to value.
 */
function parseArgs(argv) {
  const out = {};
  for (const arg of argv.slice(2)) {
    if (!arg.startsWith('--')) continue;
    const body = arg.slice(2);
    const eq = body.indexOf('=');
    if (eq === -1) {
      out[body] = true;
    } else {
      out[body.slice(0, eq)] = body.slice(eq + 1);
    }
  }
  return out;
}
/**
 * Interpret a CLI-style value as a boolean.
 *
 * `undefined` yields `defaultValue`; the literal `true` yields `true`.
 * Anything else is stringified, trimmed and lower-cased, then matched against
 * the known truthy/falsy spellings. Unrecognised text yields `defaultValue`.
 *
 * @param {*} value - Raw value (e.g. from parsed arguments).
 * @param {boolean} defaultValue - Result when the value is missing or unrecognised.
 * @returns {boolean}
 */
function parseBoolean(value, defaultValue) {
  if (value === undefined) {
    return defaultValue;
  }
  if (value === true) {
    return true;
  }
  switch (String(value).trim().toLowerCase()) {
    case '1':
    case 'true':
    case 'yes':
    case 'y':
    case 'on':
      return true;
    case '0':
    case 'false':
    case 'no':
    case 'n':
    case 'off':
      return false;
    default:
      return defaultValue;
  }
}
/**
 * Turn arbitrary text into a lowercase, hyphen-separated ASCII slug.
 *
 * Diacritics are removed after NFKD normalisation, every run of characters
 * outside `a-z0-9` becomes a single hyphen, leading/trailing hyphens are
 * trimmed, and the result is cut to 120 characters. An empty result falls
 * back to `'article'`.
 *
 * @param {*} text - Source text; falsy values are treated as empty.
 * @returns {string}
 */
function slugify(text) {
  const folded = String(text || '')
    .normalize('NFKD')
    .replace(/\p{Diacritic}+/gu, '')
    .toLowerCase();
  const hyphenated = folded.replace(/[^a-z0-9]+/g, '-');
  const trimmed = hyphenated.replace(/^-+|-+$/g, '');
  const truncated = trimmed.slice(0, 120);
  return truncated || 'article';
}
/**
 * Collapse every run of whitespace (including line breaks) into a single
 * space and drop leading/trailing whitespace. Falsy input yields ''.
 */
function cleanText(text) {
  const raw = String(text || '');
  // Splitting on whitespace runs and dropping the empty edge pieces is the
  // same as collapsing the runs and trimming.
  const words = raw.split(/\s+/).filter(Boolean);
  return words.join(' ');
}
/**
 * Strip HTML tags from text and decode a small set of entities
 * (`&nbsp;`, `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&#39;`).
 *
 * Entities are decoded in a single pass so that already-escaped text is not
 * unescaped twice (e.g. `&amp;lt;` becomes `&lt;`, not `<`).
 *
 * @param {*} html - HTML source; falsy values are treated as empty.
 * @returns {string} Tag-free, trimmed text.
 */
function stripHtml(html) {
  const entityMap = {
    '&nbsp;': ' ',
    '&amp;': '&',
    '&lt;': '<',
    '&gt;': '>',
    '&quot;': '"',
    '&#39;': "'",
  };
  return String(html || '')
    .replace(/<[^>]*>/g, '')
    .replace(/&(?:nbsp|amp|lt|gt|quot|#39);/g, (entity) => entityMap[entity])
    .trim();
}
/**
 * Render a heading as a Markdown ATX line: one `#` per level (at most six),
 * a space, then the heading text.
 *
 * @param {number} level - Heading depth.
 * @param {string} text - Heading text.
 * @returns {string}
 */
function headingToMarkdown(level, text) {
  const depth = Math.min(level, 6);
  const prefix = '#'.repeat(depth);
  return `${prefix} ${text}`;
}
// ─── Code block wrapping utilities ──────────────────────────────────────────
/**
 * Hard-wrap one line so the continuation lines keep its leading indentation.
 *
 * Lines no longer than `maxWidth` are returned untouched. Otherwise the text
 * after the indent is cut into pieces of at most `max(10, maxWidth - indent)`
 * characters, breaking just after the last whitespace inside that window
 * when there is one, and mid-token when there is none.
 *
 * @param {string} line - A single line (no newline characters expected).
 * @param {number} maxWidth - Target maximum line length.
 * @returns {string[]} The wrapped lines, each prefixed with the original indent.
 */
function wrapLineWithIndent(line, maxWidth) {
  if (line.length <= maxWidth) {
    return [line];
  }
  const indent = /^\s*/.exec(line)[0];
  const budget = Math.max(10, maxWidth - indent.length);
  const pieces = [];
  let remaining = line.slice(indent.length);
  while (remaining.length > budget) {
    // Walk back from the end of the window to just past the last whitespace.
    let cut = budget;
    while (cut >= 1 && !/\s/.test(remaining[cut - 1])) {
      cut -= 1;
    }
    // No whitespace inside the window: split the token at the window edge.
    if (cut < 1) {
      cut = budget;
    }
    pieces.push(indent + remaining.slice(0, cut).trimEnd());
    remaining = remaining.slice(cut).trimStart();
  }
  pieces.push(indent + remaining);
  return pieces;
}
/**
 * Wrap the text of a code block so that, once surrounded by `<c>` and
 * `</c>`, no line exceeds `maxWidth`.
 *
 * Every line is first wrapped to `maxWidth`. The first line is then
 * re-wrapped with room reserved for the opening tag and the last line with
 * room reserved for the closing tag (a single-line block reserves both).
 * A non-finite or non-positive width returns the text unchanged.
 *
 * @param {*} codeText - Code block content; falsy values are treated as empty.
 * @param {*} maxWidth - Target width, coerced with `Number()`.
 * @returns {string}
 */
function wrapCodeTextAccountingForTags(codeText, maxWidth) {
  const source = String(codeText || '');
  const width = Number(maxWidth);
  if (!(Number.isFinite(width) && width > 0)) {
    return source;
  }
  const openLen = '<c>'.length;
  const closeLen = '</c>'.length;
  const lines = source.split('\n').flatMap((line) => wrapLineWithIndent(line, width));
  if (lines.length === 0) {
    return '';
  }
  if (lines.length === 1) {
    // A lone line shares its row with both tags.
    const [only] = lines;
    const room = width - openLen - closeLen;
    return room > 0 && only.length > room
      ? wrapLineWithIndent(only, room).join('\n')
      : only;
  }
  const headRoom = width - openLen;
  if (headRoom > 0 && lines[0].length > headRoom) {
    lines.splice(0, 1, ...wrapLineWithIndent(lines[0], headRoom));
  }
  // Computed after the first-line splice, which may have added lines.
  const tailRoom = width - closeLen;
  const tailIndex = lines.length - 1;
  if (tailRoom > 0 && lines[tailIndex].length > tailRoom) {
    lines.splice(tailIndex, 1, ...wrapLineWithIndent(lines[tailIndex], tailRoom));
  }
  return lines.join('\n');
}
/**
 * Re-wrap the content of every `<c>…</c>` block in the exported text.
 *
 * @param {*} txt - Exported text; returned as-is when the width is unusable.
 * @param {*} [maxWidth=80] - Target width, coerced with `Number()`.
 * @returns {*} The text with each code block wrapped, or `txt` itself when
 *   the width is not a finite positive number.
 */
function wrapCodeBlocksInTxt(txt, maxWidth = 80) {
  const width = Number(maxWidth);
  const isUsableWidth = Number.isFinite(width) && width > 0;
  if (!isUsableWidth) {
    return txt;
  }
  const codeBlockPattern = /<c>([\s\S]*?)<\/c>/g;
  const rewrap = (_whole, body) => `<c>${wrapCodeTextAccountingForTags(body, width)}</c>`;
  return String(txt || '').replace(codeBlockPattern, rewrap);
}
/**
* Extract and convert article content to TXT format.
*
* Runs entirely inside the browser page via `page.evaluate`: it walks the
* children of the page's <main> element and builds a tagged plain-text string.
*
* @param {object} page - The Playwright page that already has the article loaded.
* @returns {Promise<string>} The concatenated text output. When no <main>
* element exists, the literal string 'Error: main element not found' is
* returned instead of throwing.
*/
async function extractArticleContent(page) {
// Everything inside this callback executes in the browser context, so it can
// only use DOM globals and helpers defined within the callback itself.
return await page.evaluate(() => {
const output = [];
// NOTE(review): `globalCounter` and `katexMap` are declared but never read in this function.
let globalCounter = 0; // Global counter for all visual elements (matches screenshot script)
const katexMap = new Map(); // Track unique katex formulas for referencing
// Helper: clean text (collapse whitespace runs to one space, then trim)
const cleanText = (text) => String(text || '').replace(/\s+/g, ' ').trim();
// Helper: strip HTML by parsing it into a detached <div> and reading its text
const stripHtml = (html) => {
const div = document.createElement('div');
div.innerHTML = html;
return cleanText(div.textContent || '');
};
// Helper: return the element's own id, else the id of the closest <figure>, else ''
const getAnchor = (el) => {
if (el.id) return el.id;
// Try to find ID in parent figure
const figure = el.closest('figure');
if (figure?.id) return figure.id;
return '';
};
// Helper: parse caption to extract name and description
// Returns { name, description }; `name` is '' when no "<label> N: ..." prefix matches.
const parseCaptionText = (captionText, type = 'Figure') => {
if (!captionText) return { name: '', description: '' };
// Try to match patterns like:
// "Figure 1: Description"
// "Table 2: Description"
// "Fig. 3: Description"
// The separator may be ':', '-', an en dash or an em dash; matching is case-insensitive.
const patterns = [
new RegExp(`^(${type}\\s*\\d+[a-z]?)\\s*[:\\-–—]\\s*(.+)$`, 'i'),
new RegExp(`^(Fig\\.?\\s*\\d+[a-z]?)\\s*[:\\-–—]\\s*(.+)$`, 'i'),
new RegExp(`^(Table\\s*\\d+[a-z]?)\\s*[:\\-–—]\\s*(.+)$`, 'i'),
];
for (const pattern of patterns) {
const match = captionText.match(pattern);
if (match) {
return { name: match[1].trim(), description: match[2].trim() };
}
}
// No pattern found, entire text is description
return { name: '', description: captionText.trim() };
};
// ── Reusable inline content processor ──────────────────────────────
// Walks inline DOM nodes and produces tagged text for bold, italic,
// links, inline code, inline katex, citations/references.
// Returns the accumulated string, trimmed. The checks below run in order and
// the first one that matches a node decides how it is emitted.
const processInlineContent = (parentNode) => {
let result = '';
const walk = (n) => {
if (n.nodeType === Node.TEXT_NODE) {
result += n.textContent;
return;
}
const tag = n.tagName?.toLowerCase();
// Inline code (not inside <pre>)
if (tag === 'code' && !n.closest('pre')) {
result += `<ic>${cleanText(n.textContent)}</ic>`;
return;
}
// Inline KaTeX
// Emits the full text content of the .katex element.
if (n.classList?.contains('katex')) {
result += `<il>${cleanText(n.textContent || '')}</il>`;
return;
}
// Bold
if (tag === 'strong' || tag === 'b') {
const inner = processInlineContent(n);
if (inner) result += `<b>${inner}</b>`;
return;
}
// Italic
if (tag === 'em' || tag === 'i') {
const inner = processInlineContent(n);
if (inner) result += `<i>${inner}</i>`;
return;
}
// Links
// NOTE(review): `href` is inserted as-is; a double quote inside it is not escaped.
if (tag === 'a') {
const href = n.getAttribute('href') || '';
const inner = processInlineContent(n);
if (href && inner) {
result += `<a href="${href}">${inner}</a>`;
} else if (inner) {
result += inner;
}
return;
}
// Superscript (often used for citations/footnotes)
if (tag === 'sup') {
const inner = cleanText(n.textContent || '');
if (inner) result += `<ref>${inner}</ref>`;
return;
}
// Mark / highlight
// Highlighted text is emitted with the same <b> tag as bold text.
if (tag === 'mark') {
const inner = processInlineContent(n);
if (inner) result += `<b>${inner}</b>`;
return;
}
// Skip display-level elements inside inline context
// Their content is dropped from the inline result entirely.
if (['div', 'figure', 'table', 'pre', 'ul', 'ol'].includes(tag)) return;
// Recurse into children
if (n.childNodes) {
n.childNodes.forEach(walk);
}
};
if (parentNode.childNodes) {
parentNode.childNodes.forEach(walk);
}
return result.trim();
};
// Process main content
// NOTE(review): a missing <main> is reported by returning an error string,
// which the caller receives as if it were article content.
const main = document.querySelector('main');
if (!main) return 'Error: main element not found';
// Helper: get all visual elements in DOM order from the whole document
// (same scope as screenshot script which queries the full page)
const allVisualElements = Array.from(document.querySelectorAll('.html-embed, .table-scroll > table, .image-wrapper, .katex-display'));
// Maps each visual element to its baseName ("<index>-<type>[--<slug>]").
const elementIndexMap = new Map();
// Helper: slugify (same as screenshot script)
const slugifyLabel = (text) => String(text || '')
.normalize('NFKD').replace(/[\u0300-\u036f]/g, '')
.toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-+|-+$/g, '').slice(0, 120);
// Helper: extract label from element (same logic as screenshot script)
// Lookup order: embed download filename, embed title, data-*/aria-label/title/id
// attributes, table <caption>, first <img> alt/title, first heading; '' if none.
const getElementLabel = (el) => {
if (el.classList.contains('html-embed')) {
const btn = el.querySelector('.html-embed__download');
const filename = btn?.getAttribute('data-filename') || '';
if (filename) return filename;
const title = el.querySelector('.html-embed__title');
if (title?.textContent) return title.textContent;
}
const getAttr = (name) => el.getAttribute(name) || '';
const direct = getAttr('data-title') || getAttr('data-name') || getAttr('data-label')
|| getAttr('data-slug') || getAttr('aria-label') || getAttr('title') || getAttr('id');
if (direct) return direct;
if (el.tagName.toLowerCase() === 'table') {
const caption = el.querySelector('caption');
if (caption) return caption.textContent || '';
}
const img = el.querySelector('img');
if (img) return img.getAttribute('alt') || img.getAttribute('title') || '';
const heading = el.querySelector('h1,h2,h3,h4,h5,h6');
if (heading) return heading.textContent || '';
return '';
};
// Helper: get element type (same as screenshot script)
const getElementType = (el) => {
if (el.matches('.html-embed')) return 'embed';
if (el.matches('.table-scroll > table')) return 'table';
if (el.matches('.image-wrapper')) return 'image';
if (el.matches('.katex-display')) return 'katex';
return 'unknown';
};
// Pre-process: assign screenshot-matching baseName to visual elements
// The index is 1-based and counts every visual element in document order.
allVisualElements.forEach((el, idx) => {
const type = getElementType(el);
const label = getElementLabel(el);
const slug = slugifyLabel(label);
const baseName = `${idx + 1}-${type}${slug ? '--' + slug : ''}`;
elementIndexMap.set(el, baseName);
});
// Walk through all child nodes
// Each branch below handles one kind of element and returns; the order of the
// branches matters (e.g. .katex-display and standalone .image-wrapper are
// checked before the generic <figure> and note handling).
const processNode = (node) => {
const tag = node.tagName?.toLowerCase();
// Headings
// Emitted as Markdown-style "#" lines surrounded by blank lines.
if (/^h[1-6]$/.test(tag)) {
const level = parseInt(tag[1]);
const text = cleanText(node.textContent);
const hashes = '#'.repeat(level);
output.push(`\n${hashes} ${text}\n`);
return;
}
// Paragraphs
// Paragraphs whose text is empty after trimming are skipped.
if (tag === 'p') {
const text = node.textContent?.trim();
if (text) {
// Process inline elements within paragraph
output.push(processInlineContent(node) + '\n');
}
return;
}
// Display math (KaTeX)
// Only the baseName is written; the formula text itself is not exported here.
if (node.classList?.contains('katex-display')) {
const baseName = elementIndexMap.get(node);
if (baseName) {
output.push(`<l>${baseName}</l>\n`);
}
return;
}
// Code blocks
// A <pre> without a <code> child produces no output.
if (tag === 'pre') {
const code = node.querySelector('code');
if (code) {
const codeText = code.textContent || '';
// NOTE(review): `language` is computed but not used in the output.
const language = code.className.match(/language-(\w+)/)?.[1] || '';
// Look for a description in the <figcaption> of an enclosing <figure>
let description = '';
const figure = node.closest('figure');
if (figure) {
const caption = figure.querySelector('figcaption');
if (caption) description = stripHtml(caption.innerHTML);
}
if (description) {
output.push(`<c>${codeText.trim()} | ${description}</c>\n`);
} else {
output.push(`<c>${codeText.trim()}</c>\n`);
}
}
return;
}
// Tables
if (tag === 'table') {
// Check if this table is in a .table-scroll container (visual element)
const tableScroll = node.closest('.table-scroll');
const baseName = tableScroll ? elementIndexMap.get(node) : null;
// Skip tables that are not tracked visual elements (no output, no recursion)
if (!baseName) {
return;
}
const figure = node.closest('figure');
let name = '';
let description = '';
let anchor = '';
if (figure) {
anchor = getAnchor(figure);
const caption = figure.querySelector('figcaption');
if (caption) {
const captionText = stripHtml(caption.innerHTML);
const parsed = parseCaptionText(captionText, 'Table');
name = parsed.name;
description = parsed.description;
}
}
// If no name found, use the screenshot baseName
if (!name) {
name = baseName;
}
// Build the tag: <t>NAME | ANCHOR | DESCRIPTION</t> (empty fields omitted)
const parts = [name];
if (anchor) parts.push(anchor);
if (description) parts.push(description);
output.push(`<t>${parts.join(' | ')}</t>\n`);
// Extract table as simple text representation
// One line per <tr>, cells joined by " | ".
const rows = Array.from(node.querySelectorAll('tr'));
const tableText = rows.map(row => {
const cells = Array.from(row.querySelectorAll('th, td'));
return cells.map(cell => cleanText(cell.textContent)).join(' | ');
}).join('\n');
output.push(tableText + '\n\n');
return;
}
// Standalone .image-wrapper (not inside a <figure>)
// Emitted as <f>BASENAME | ALT</f>, with ALT omitted when empty.
if (node.classList?.contains('image-wrapper') && !node.closest('figure')) {
const baseName = elementIndexMap.get(node);
if (baseName) {
const img = node.querySelector('img');
const alt = img?.alt || '';
const parts = [baseName];
if (alt) parts.push(alt);
output.push(`<f>${parts.join(' | ')}</f>\n\n`);
}
return;
}
// Figures (images, embeds)
if (tag === 'figure') {
const img = node.querySelector('img');
const htmlEmbed = node.querySelector('.html-embed, .html-embed--screenshot');
const imageWrapper = node.querySelector('.image-wrapper');
const caption = node.querySelector('figcaption');
// Skip if it's not really a figure (no img, no embed, no caption)
if (!img && !htmlEmbed && !imageWrapper && !caption) return;
// Try to find the screenshot baseName from the visual element
// Falls back to the <figure> itself, which is only in the map if it matched
// one of the visual-element selectors.
const visualElement = htmlEmbed || imageWrapper || node;
const baseName = elementIndexMap.get(visualElement);
if (!baseName) return; // Skip if not tracked
let name = '';
let anchor = getAnchor(node);
let description = '';
if (caption) {
const captionText = stripHtml(caption.innerHTML);
const parsed = parseCaptionText(captionText, 'Figure');
name = parsed.name;
description = parsed.description;
}
// Get image alt text as fallback for description
if (!description && img?.alt) {
description = img.alt;
}
// If no name found in caption, use the screenshot baseName
if (!name) {
name = baseName;
}
// Build the tag: <f>NAME | ANCHOR | DESCRIPTION</f> (empty fields omitted)
const parts = [name];
if (anchor) parts.push(anchor);
if (description) parts.push(description);
output.push(`<f>${parts.join(' | ')}</f>\n\n`);
return;
}
// Lists
// Only direct <li> children are emitted ("-" for <ul>, "N." for <ol>).
// NOTE(review): nested <ul>/<ol> inside an item are skipped by
// processInlineContent, so nested list content does not appear in the output.
if (tag === 'ul' || tag === 'ol') {
const items = Array.from(node.querySelectorAll(':scope > li'));
items.forEach((item, idx) => {
const bullet = tag === 'ul' ? '-' : `${idx + 1}.`;
const text = processInlineContent(item);
output.push(`${bullet} ${text}\n`);
});
output.push('\n');
return;
}
// Blockquotes
// The whole quote is emitted as a single "> " line.
if (tag === 'blockquote') {
const text = processInlineContent(node);
output.push(`> ${text}\n\n`);
return;
}
// Notes (Note component and Sidenote)
// Emitted as <n><b>TITLE</b> | BODY</n>, with whichever part is missing omitted.
if (node.classList?.contains('note') || node.classList?.contains('sidenote')) {
const titleEl = node.querySelector('.note__title, .note-title');
const title = cleanText(titleEl?.textContent || '');
// Process body content excluding the title element
let body = '';
const bodyNodes = Array.from(node.children).filter(
(c) => c !== titleEl && !c.classList?.contains('note__title') && !c.classList?.contains('note-title'),
);
for (const child of bodyNodes) {
body += processInlineContent(child) + ' ';
}
body = body.replace(/\s+/g, ' ').trim();
if (title && body) {
output.push(`<n><b>${title}</b> | ${body}</n>\n\n`);
} else if (title) {
output.push(`<n><b>${title}</b></n>\n\n`);
} else if (body) {
output.push(`<n>${body}</n>\n\n`);
}
return;
}
// Recurse through children for unhandled elements
// Errors thrown while processing children are logged to the page console and
// the remaining siblings of the failing child are skipped.
if (node.children && node.children.length > 0 && !['pre', 'code', 'table', 'figure'].includes(tag)) {
try {
Array.from(node.children).forEach(processNode);
} catch (e) {
console.error('Error processing children:', e);
}
}
};
// Process all direct children of main
Array.from(main.children).forEach(processNode);
// Add metadata about visual elements
// NOTE(review): this count covers only formulas inside <main>, while the
// numbering above covers the whole document.
const katexCount = Array.from(main.querySelectorAll('.katex-display')).length;
if (katexCount > 0) {
output.push(`\n\n<!-- Visual elements are numbered globally in DOM order (1, 2, 3...) to match exported screenshots -->\n`);
output.push(`<!-- KaTeX formulas: ${katexCount} formulas exported as N-katex.png where N is the global index -->\n`);
}
return output.join('');
});
}
/**
 * Entry point: build the site if needed, start the preview server, load the
 * page in headless Chromium, extract the article as tagged text and write it
 * to `dist/<name>.txt` (also copied to `public/<name>.txt`).
 *
 * Recognised flags (see `parseArgs`):
 * - `--filename=NAME`  output base name; defaults to a slug of the page title
 * - `--wrap-code=BOOL` wrap `<c>` blocks (default: true)
 * - `--code-width=N`   wrap width (default: 80)
 *
 * The `PREVIEW_PORT` environment variable, when set, is used as the preview
 * port without waiting for the server to print its URL.
 *
 * @returns {Promise<void>}
 * @throws {Error} When the build fails, the server does not come up in time,
 *   or the browser/extraction step fails. The preview process is always shut
 *   down on the way out.
 */
async function main() {
  const cwd = process.cwd();
  const args = parseArgs(process.argv);
  // A bare `--filename` flag parses to `true`; only a non-empty string counts
  // as a user-supplied name (calling `.replace` on `true` would throw).
  const hasCustomFilename = typeof args.filename === 'string' && args.filename !== '';
  let outFileBase = hasCustomFilename ? args.filename : 'article';
  outFileBase = outFileBase.replace(/\.txt$/i, '');

  // Build only if dist/ does not exist
  const distDir = resolve(cwd, 'dist');
  let hasDist = false;
  try {
    const st = await fs.stat(distDir);
    hasDist = st && st.isDirectory();
  } catch {
    // dist/ is missing or unreadable: fall through and build.
  }
  if (!hasDist) {
    console.log('> Building Astro site…');
    await run('npm', ['run', 'build']);
  } else {
    console.log('> Skipping build (dist/ exists)…');
  }

  console.log('> Starting Astro preview…');
  // Capture stdout/stderr to detect the actual port used
  let capturedPort = 8080;
  let sawPreviewUrl = false;
  const maybeCapturePort = (output) => {
    const match = output.match(/http:\/\/localhost:(\d+)\//);
    if (match) {
      capturedPort = Number.parseInt(match[1], 10);
      sawPreviewUrl = true;
    }
  };
  const previewPortEnv = process.env.PREVIEW_PORT ? Number(process.env.PREVIEW_PORT) : null;
  if (previewPortEnv) {
    capturedPort = previewPortEnv;
    sawPreviewUrl = true;
  }

  // `detached` puts the preview in its own process group so the whole group
  // can be signalled during shutdown (see the `finally` block).
  const preview = spawn('npm', ['run', 'preview'], {
    cwd,
    stdio: ['ignore', 'pipe', 'pipe'],
    detached: true
  });
  preview.stdout.on('data', (data) => {
    const output = data.toString();
    process.stdout.write(output);
    maybeCapturePort(output);
  });
  preview.stderr.on('data', (data) => {
    const output = data.toString();
    process.stderr.write(output);
    maybeCapturePort(output);
  });
  const previewExit = new Promise((resolvePreview) => {
    preview.on('close', (code, signal) => resolvePreview({ code, signal }));
  });

  // Returns null until the preview URL (or PREVIEW_PORT) is known, which makes
  // waitForServer keep polling instead of probing a guessed port.
  const getBaseUrl = () => {
    if (!sawPreviewUrl) return null;
    return `http://localhost:${capturedPort}/`;
  };

  try {
    await waitForServer(getBaseUrl, 60000);
    const baseUrl = getBaseUrl();
    console.log(`> Server ready (${baseUrl}), extracting content…`);
    const browser = await chromium.launch({ headless: true });
    try {
      const context = await browser.newContext();
      const page = await context.newPage();
      // Set viewport
      await page.setViewportSize({ width: 1200, height: 1400 });
      // Load page (use 'load' instead of 'networkidle' to avoid timeout on heavy pages)
      await page.goto(baseUrl, { waitUntil: 'load', timeout: 60000 });
      // Wait for content to be ready
      await page.waitForTimeout(3000);
      // Wait for main content to be present
      await page.waitForSelector('main', { timeout: 10000 });

      // Get article title for filename
      if (!hasCustomFilename) {
        const title = await page.evaluate(() => {
          const h1 = document.querySelector('h1.hero-title');
          const t = h1 ? h1.textContent : document.title;
          return (t || '').replace(/\s+/g, ' ').trim();
        });
        outFileBase = slugify(title);
      }

      console.log('> Extracting article content…');
      let txtContent = await extractArticleContent(page);

      // Optional code wrapping
      const wrapCode = parseBoolean(args['wrap-code'], true);
      const codeWidth = Number(args['code-width']) || 80;
      if (wrapCode) {
        txtContent = wrapCodeBlocksInTxt(txtContent, codeWidth);
        console.log(`> Code blocks wrapped at ${codeWidth} columns`);
      }

      // Write output
      const outPath = resolve(cwd, 'dist', `${outFileBase}.txt`);
      await fs.writeFile(outPath, txtContent, 'utf-8');
      console.log(`✅ TXT exported: ${outPath}`);

      // Copy to public folder (best effort: a failure here only warns)
      const publicPath = resolve(cwd, 'public', `${outFileBase}.txt`);
      try {
        await fs.mkdir(resolve(cwd, 'public'), { recursive: true });
        await fs.copyFile(outPath, publicPath);
        console.log(`✅ TXT copied to: ${publicPath}`);
      } catch (e) {
        console.warn('Unable to copy TXT to public/:', e?.message || e);
      }
    } finally {
      await browser.close();
    }
  } finally {
    // Clean shutdown. Every step is best effort: the process may already be
    // gone, in which case the kill calls throw and are deliberately ignored.
    // `preview.killed` only says a signal was *sent*, not that the process
    // ended, so liveness is read from exitCode/signalCode instead.
    const previewHasExited = () => preview.exitCode !== null || preview.signalCode !== null;
    try {
      if (process.platform !== 'win32') {
        // Negative pid targets the whole process group created by `detached`.
        try { process.kill(-preview.pid, 'SIGINT'); } catch { }
      }
      try { preview.kill('SIGINT'); } catch { }
      await Promise.race([previewExit, delay(3000)]);
      if (!previewHasExited()) {
        // Still alive after the grace period: escalate.
        try {
          if (process.platform !== 'win32') {
            try { process.kill(-preview.pid, 'SIGKILL'); } catch { }
          }
          try { preview.kill('SIGKILL'); } catch { }
        } catch { }
        await Promise.race([previewExit, delay(1000)]);
      }
    } catch { }
  }
}
// Script entry: run the export and, on any failure, print the message plus
// the full error object before exiting with status 1.
const reportFailureAndExit = (err) => {
  console.error('❌ Error:', err.message);
  console.error(err);
  process.exit(1);
};

main().catch(reportFailureAndExit);