tfrere's picture
tfrere HF Staff
feat: cherry-pick improvements from PR#13
fe5248a
#!/usr/bin/env node
/**
* Export TXT to DOCX format for book publishing
*
* This script converts the exported TXT file to a simple DOCX document:
* - Preserves headings, paragraphs, lists
* - Renders inline formatting: <b> bold, <i> italic, <a> links, <ref> citations
* - Renders <ic> inline code, <il> inline LaTeX
* - Keeps block tags (<f>, <t>, <l>, <n>) with color coding
* - Formats code blocks
* - Creates a clean document ready for further editing
*
* Usage:
* node scripts/export-docx.mjs [--input=path/to/file.txt] [--output=path/to/file.docx]
* npm run export:docx
*/
import { Document, Packer, Paragraph, TextRun, HeadingLevel, AlignmentType, LineRuleType } from 'docx';
import { promises as fs } from 'node:fs';
import { basename, resolve } from 'node:path';
import process from 'node:process';
/**
 * Parse `--key=value` / `--flag` style command-line arguments.
 *
 * Only the FIRST `=` separates key from value, so values that themselves
 * contain `=` (e.g. `--input=dir/a=b.txt`) are kept intact.
 *
 * @param {string[]} argv - Full argv array; the first two entries are skipped.
 * @returns {Object<string, string|true>} Map of option name to its string
 *   value, or `true` for a flag given without `=`. Arguments that do not
 *   start with `--` are ignored.
 */
function parseArgs(argv) {
  const out = {};
  for (const arg of argv.slice(2)) {
    if (!arg.startsWith('--')) continue;
    const body = arg.slice(2);
    const eq = body.indexOf('=');
    if (eq === -1) {
      out[body] = true;
    } else {
      out[body.slice(0, eq)] = body.slice(eq + 1);
    }
  }
  return out;
}
/**
 * Recognise a Markdown-style heading line ("#" to "######" followed by
 * whitespace and a title).
 *
 * @param {string} line - Raw line; the hashes must start at column 0.
 * @returns {{level: number, text: string} | null} Heading depth (1-6) and
 *   trimmed title, or null when the line is not a heading.
 */
function detectHeadingLevel(line) {
  const found = line.match(/^(#{1,6})\s+(.+)$/);
  if (found === null) {
    return null;
  }
  const [, hashes, title] = found;
  return { level: hashes.length, text: title.trim() };
}
/**
 * Pull the re-usable formatting fields off a run so it can be rebuilt with
 * extra styling.
 *
 * NOTE(review): this reads `run.options`; it assumes the run object exposes
 * its constructor options under that property — confirm against the installed
 * docx version. When the property is absent every field is undefined and
 * `text` falls back to ''.
 *
 * @param {{options?: object}} run - Run-like object to inspect.
 * @returns {{text: string, bold: *, italics: *, font: *, color: *,
 *   underline: *, superScript: *, shading: *, size: *}} Copied fields.
 */
function extractRunProps(run) {
  const {
    text,
    bold,
    italics,
    font,
    color,
    underline,
    superScript,
    shading,
    size,
  } = run.options || {};
  return {
    text: text || '',
    bold,
    italics,
    font,
    color,
    underline,
    superScript,
    shading,
    size,
  };
}
/**
 * Split inline markup into plain TextRun option objects.
 *
 * Private helper for parseInlineFormatting. Working on plain objects (rather
 * than on constructed TextRun instances) lets nested tags combine their
 * styling by simple object spreading, with no need to read anything back out
 * of an already-built run.
 *
 * @param {string} text - Markup fragment to split.
 * @param {object} inherited - Run options contributed by enclosing tags.
 * @returns {object[]} One option object per segment, each with a `text` key.
 *   Empty only when `text` is the empty string.
 */
function collectInlineRunOptions(text, inherited) {
  // The backreference (\1) makes the closing tag match the opening one, and
  // the lookahead lets the content contain other tags (and stray "<").
  const tagPattern = /<(ic|il|ref|b|i|a)(\s[^>]*)?>([^<]*(?:<(?!\/\1>)[^<]*)*)<\/\1>/g;
  const segments = [];
  let pos = 0;

  for (const match of text.matchAll(tagPattern)) {
    // Literal text between the previous tag and this one.
    if (match.index > pos) {
      segments.push({ ...inherited, text: text.substring(pos, match.index) });
    }

    const [whole, tagType, attrs = '', content] = match;

    // Tags whose content may itself contain inline tags: recurse with the
    // accumulated style; otherwise emit a single styled segment.
    const pushNestable = (style) => {
      if (content.includes('<')) {
        segments.push(...collectInlineRunOptions(content, style));
      } else {
        segments.push({ ...style, text: content });
      }
    };

    switch (tagType) {
      case 'ic':
        // Inline code: content is kept verbatim (it may legitimately contain "<").
        segments.push({
          ...inherited,
          text: content,
          font: 'Courier New',
          color: '333333',
          shading: { fill: 'E8E8E8', type: 'clear' },
        });
        break;
      case 'il':
        // Inline LaTeX: kept verbatim, shown in italic blue.
        segments.push({
          ...inherited,
          text: content,
          italics: true,
          color: '0066CC',
        });
        break;
      case 'b':
        pushNestable({ ...inherited, bold: true });
        break;
      case 'i':
        pushNestable({ ...inherited, italics: true });
        break;
      case 'a': {
        // Link: underlined blue text, followed by the URL when it is absolute.
        const hrefMatch = attrs.match(/href="([^"]*)"/);
        const href = hrefMatch ? hrefMatch[1] : '';
        pushNestable({
          ...inherited,
          color: '0066CC',
          underline: { type: 'single' },
        });
        if (href && href.startsWith('http')) {
          segments.push({
            ...inherited,
            text: ` [${href}]`,
            color: '888888',
            size: 18,
          });
        }
        break;
      }
      case 'ref':
        segments.push({
          ...inherited,
          text: content,
          superScript: true,
          color: '0066CC',
        });
        break;
      default:
        // Not reachable with the current pattern; keep the raw markup visible.
        segments.push({ ...inherited, text: whole });
    }

    pos = match.index + whole.length;
  }

  // Remaining text after the last tag.
  if (pos < text.length) {
    segments.push({ ...inherited, text: text.substring(pos) });
  }
  return segments;
}

/**
 * Parse inline markup (<b>, <i>, <a>, <ref>, <ic>, <il>) into DOCX text runs.
 *
 * Nested formatting such as `<b>bold <i>both</i></b>` keeps its text and
 * combines the styles of every enclosing tag. Unstyled text is passed to
 * TextRun as a bare string; styled text as an options object.
 *
 * @param {string} text - One line (or fragment) of exported TXT markup.
 * @returns {TextRun[]} At least one run; empty input yields a single empty run.
 */
function parseInlineFormatting(text) {
  const segments = collectInlineRunOptions(text, {});
  if (segments.length === 0) {
    return [new TextRun(text)];
  }
  return segments.map((options) => (
    Object.keys(options).length === 1 ? new TextRun(options.text) : new TextRun(options)
  ));
}
/**
 * Build one shaded DOCX paragraph from the lines of a code block.
 *
 * Every source line becomes a Courier New run; from the second line on, a
 * break-only run is inserted first so the lines stay visually separate inside
 * the single paragraph.
 *
 * @param {string[]} codeLines - Code block content, one entry per line.
 * @returns {Paragraph} Paragraph with a light gray fill holding all runs.
 */
function codeBlockToParagraph(codeLines) {
  const monospaced = (codeLine) => new TextRun({
    text: codeLine,
    font: 'Courier New',
    size: 18,
    color: '333333',
  });

  const children = codeLines.flatMap((codeLine, index) => (
    index > 0
      ? [new TextRun({ break: 1 }), monospaced(codeLine)]
      : [monospaced(codeLine)]
  ));

  const spacing = {
    before: 200,
    after: 200,
    line: 276,
    lineRule: LineRuleType.AUTO,
  };

  return new Paragraph({
    children,
    shading: { fill: 'F5F5F5', type: 'clear' },
    spacing,
  });
}
/**
 * Convert the exported TXT file into a DOCX document and write it to disk.
 *
 * The input is processed line by line. Recognised constructs, in order:
 * code blocks (<c>...</c>, single- or multi-line), block tags kept verbatim
 * with colour coding (<f>, <t>, <l>), single-line notes (<n>...</n>),
 * "#" headings, "- " bullets, "1. " numbered items, "> " blockquotes, and
 * regular paragraphs. Blank lines outside code blocks become empty paragraphs.
 *
 * @param {string} txtPath - Path of the TXT file to read (UTF-8).
 * @param {string} outputPath - Path of the DOCX file to write.
 * @returns {Promise<void>} Resolves once the DOCX has been written; rejects
 *   with the underlying error if reading, packing or writing fails.
 */
async function convertTxtToDocx(txtPath, outputPath) {
  console.log(`📖 Reading TXT file: ${txtPath}`);
  const content = await fs.readFile(txtPath, 'utf-8');
  // Accept LF and CRLF: a leftover "\r" would defeat the heading regex
  // (its `.+$` cannot cross a carriage return) and leak into code lines.
  const lines = content.split(/\r?\n/);
  const paragraphs = [];
  let inCodeBlock = false;
  let codeLines = [];
  // Numbered-list bookkeeping: a new list instance starts whenever a numbered
  // item follows a non-blank line that was not itself a numbered item.
  let numberedListInstance = 0;
  let previousWasNumbered = false;

  // Block tags that are kept verbatim (tag included) and only colour-coded.
  const blockTagStyles = [
    { open: '<f>', color: '0066CC', paragraph: { spacing: { before: 200, after: 100 } } },
    { open: '<t>', color: '009688', paragraph: { spacing: { before: 200, after: 100 } } },
    {
      open: '<l>',
      color: '9C27B0',
      paragraph: { alignment: AlignmentType.CENTER, spacing: { before: 200, after: 200 } },
    },
  ];

  for (const line of lines) {
    const trimmed = line.trim();

    // Blank lines outside code blocks are preserved as empty paragraphs.
    if (!trimmed && !inCodeBlock) {
      paragraphs.push(new Paragraph({ text: '' }));
      continue;
    }

    // Blank lines (handled above) do not break a numbered list; any other
    // line does unless it turns out to be a numbered item itself.
    const continuesNumberedList = previousWasNumbered;
    previousWasNumbered = false;

    // Inside a code block only the closing tag is significant; everything
    // else (including lines that look like markup) is code.
    if (inCodeBlock) {
      if (trimmed.endsWith('</c>')) {
        const lastLine = line.replace(/<\/c>\s*$/, '');
        if (lastLine) codeLines.push(lastLine);
        paragraphs.push(codeBlockToParagraph(codeLines));
        inCodeBlock = false;
        codeLines = [];
      } else {
        codeLines.push(line);
      }
      continue;
    }

    // Code block opener <c>
    if (trimmed.startsWith('<c>')) {
      // Single-line code block: <c>code</c>. An empty <c></c> emits nothing
      // and, importantly, does not leave code mode switched on.
      if (trimmed.endsWith('</c>')) {
        const inner = trimmed.replace(/^<c>/, '').replace(/<\/c>$/, '');
        if (inner) paragraphs.push(codeBlockToParagraph([inner]));
        continue;
      }
      inCodeBlock = true;
      codeLines = [];
      // Strip the opener even when the line is indented.
      const firstLine = line.trimStart().replace(/^<c>\s*/, '');
      if (firstLine && !firstLine.startsWith('</c>')) {
        codeLines.push(firstLine);
      }
      continue;
    }

    // Figure / table / display-LaTeX tags: keep the raw line, colour-coded.
    const blockTag = blockTagStyles.find(({ open }) => trimmed.startsWith(open));
    if (blockTag) {
      paragraphs.push(new Paragraph({
        children: [new TextRun({ text: trimmed, color: blockTag.color, bold: true })],
        ...blockTag.paragraph,
      }));
      continue;
    }

    // Note/callout tags <n>...</n> (single-line only)
    if (trimmed.startsWith('<n>') && trimmed.endsWith('</n>')) {
      const inner = trimmed.replace(/^<n>/, '').replace(/<\/n>$/, '');
      paragraphs.push(new Paragraph({
        children: parseInlineFormatting(inner),
        indent: { left: 360 },
        shading: { fill: 'FFF8E1', type: 'clear' },
        spacing: { before: 200, after: 200 },
      }));
      continue;
    }

    // Headings (matched against the untrimmed line: "#" must be at column 0)
    const heading = detectHeadingLevel(line);
    if (heading) {
      paragraphs.push(new Paragraph({
        text: heading.text,
        heading: HeadingLevel[`HEADING_${heading.level}`],
        spacing: { before: 400, after: 200 },
      }));
      continue;
    }

    // Bullet list items
    if (trimmed.startsWith('- ')) {
      paragraphs.push(new Paragraph({
        children: parseInlineFormatting(trimmed.substring(2)),
        bullet: { level: 0 },
        spacing: { before: 100, after: 100 },
      }));
      continue;
    }

    // Numbered list items. The source number is dropped; Word renumbers via
    // the 'default-numbering' definition registered on the Document below.
    const numberedMatch = trimmed.match(/^(\d+)\.\s+(.+)$/);
    if (numberedMatch) {
      if (!continuesNumberedList) numberedListInstance += 1;
      previousWasNumbered = true;
      paragraphs.push(new Paragraph({
        children: parseInlineFormatting(numberedMatch[2]),
        numbering: {
          reference: 'default-numbering',
          level: 0,
          // A distinct instance per list is intended to restart counting at 1.
          instance: numberedListInstance,
        },
        spacing: { before: 100, after: 100 },
      }));
      continue;
    }

    // Blockquotes
    if (trimmed.startsWith('> ')) {
      paragraphs.push(new Paragraph({
        children: parseInlineFormatting(trimmed.substring(2)),
        // NOTE(review): `italics` is passed as a paragraph-level option here;
        // confirm docx applies it to the text — it may be ignored.
        italics: true,
        indent: { left: 720 },
        spacing: { before: 200, after: 200 },
      }));
      continue;
    }

    // Regular paragraph (the line is known to be non-blank at this point)
    paragraphs.push(new Paragraph({
      children: parseInlineFormatting(trimmed),
      spacing: { before: 100, after: 100 },
    }));
  }

  // An unterminated <c> block would otherwise vanish from the output.
  if (inCodeBlock && codeLines.length > 0) {
    console.warn('⚠️ Unterminated <c> block at end of file; emitting collected lines.');
    paragraphs.push(codeBlockToParagraph(codeLines));
  }

  console.log(`📝 Creating DOCX with ${paragraphs.length} paragraphs...`);
  const doc = new Document({
    // Definition backing the 'default-numbering' reference used by numbered
    // list paragraphs; without it the reference points at nothing.
    numbering: {
      config: [{
        reference: 'default-numbering',
        levels: [{
          level: 0,
          format: 'decimal',
          text: '%1.',
          alignment: AlignmentType.START,
          style: { paragraph: { indent: { left: 720, hanging: 360 } } },
        }],
      }],
    },
    sections: [{
      properties: {},
      children: paragraphs,
    }],
  });
  console.log(`💾 Writing DOCX to: ${outputPath}`);
  const buffer = await Packer.toBuffer(doc);
  await fs.writeFile(outputPath, buffer);
  console.log(`✅ DOCX created successfully!`);
}
/**
 * CLI entry point: resolve input/output paths, run the conversion, then copy
 * the resulting DOCX into `<cwd>/public/`.
 *
 * Options (see parseArgs): `--input=<txt path>`, `--output=<docx path>`.
 * Exits with status 1 when an option is given without a value or when the
 * input file cannot be accessed. A failure of the copy step is only warned
 * about (best effort).
 *
 * @returns {Promise<void>}
 */
async function main() {
  const cwd = process.cwd();
  const args = parseArgs(process.argv);

  // parseArgs yields `true` for a flag without "=value"; treating that as a
  // path would crash later on string methods.
  for (const flag of ['input', 'output']) {
    if (args[flag] === true) {
      console.error(`❌ Error: --${flag} requires a value (use --${flag}=path)`);
      process.exit(1);
    }
  }

  const inputPath = args.input || resolve(cwd, 'dist', 'the-smol-training-playbook-the-secrets-to-building-world-class-llms.txt');
  // Swap only a trailing ".txt"; for any other name append ".docx" so the
  // output can never be the same path as the input.
  const txtExtension = /\.txt$/i;
  const derivedOutput = txtExtension.test(inputPath)
    ? inputPath.replace(txtExtension, '.docx')
    : `${inputPath}.docx`;
  const outputPath = args.output || derivedOutput;

  // Check if input exists
  try {
    await fs.access(inputPath);
  } catch {
    console.error(`❌ Error: Input file not found: ${inputPath}`);
    console.error(' Run "npm run export:txt" first to generate the TXT file.');
    process.exit(1);
  }

  await convertTxtToDocx(inputPath, outputPath);

  // Also copy to the public folder. The destination is built from the file
  // name so it always lands in the directory created below, regardless of
  // where the output was written or which path separator the platform uses.
  const publicDir = resolve(cwd, 'public');
  const publicPath = resolve(publicDir, basename(outputPath));
  if (publicPath === resolve(outputPath)) {
    // Output was written straight into public/; nothing to copy.
    return;
  }
  try {
    await fs.mkdir(publicDir, { recursive: true });
    await fs.copyFile(outputPath, publicPath);
    console.log(`✅ DOCX copied to: ${publicPath}`);
  } catch (e) {
    console.warn('Unable to copy DOCX to public/:', e?.message || e);
  }
}
// Script entry: run main() and convert any rejection into a logged failure
// with a non-zero exit status.
const reportFatalError = (err) => {
  console.error('❌ Error:', err.message);
  console.error(err);
  process.exit(1);
};

main().catch(reportFatalError);