smol-training-playbook

Running on CPU Upgrade

File size: 11,783 Bytes

#!/usr/bin/env node

/**
 * Export TXT to DOCX format for book publishing
 * 
 * This script converts the exported TXT file to a simple DOCX document:
 * - Preserves headings, paragraphs, lists
 * - Renders inline formatting: <b> bold, <i> italic, <a> links, <ref> citations
 * - Renders <ic> inline code, <il> inline LaTeX
 * - Keeps block tags (<f>, <t>, <l>) verbatim with color coding; renders <n> notes as shaded, indented paragraphs
 * - Formats code blocks
 * - Creates a clean document ready for further editing
 * 
 * Usage:
 *   node scripts/export-docx.mjs [--input=path/to/file.txt] [--output=path/to/file.docx]
 *   npm run export:docx
 */

import { promises as fs } from 'node:fs';
import { basename, resolve } from 'node:path';
import process from 'node:process';
import { Document, Packer, Paragraph, TextRun, HeadingLevel, AlignmentType, LineRuleType } from 'docx';

/**
 * Parse `--key=value` and bare `--flag` command-line arguments.
 *
 * Only the first `=` separates key from value, so values that themselves
 * contain `=` (for example a path such as `a=b.txt`) are kept intact.
 * Arguments that do not start with `--` are ignored.
 *
 * @param {string[]} argv - Full argument vector; the first two entries
 *   (runtime and script path) are skipped.
 * @returns {Object<string, string|true>} Map of option name to its string
 *   value, or `true` for flags given without `=`.
 */
function parseArgs(argv) {
  const out = {};
  for (const arg of argv.slice(2)) {
    if (!arg.startsWith('--')) continue;
    const body = arg.slice(2);
    const separatorIndex = body.indexOf('=');
    if (separatorIndex === -1) {
      out[body] = true;
    } else {
      out[body.slice(0, separatorIndex)] = body.slice(separatorIndex + 1);
    }
  }
  return out;
}

/**
 * Recognize a Markdown-style heading line (`#` through `######`).
 *
 * The line must start with one to six `#` characters followed by whitespace
 * and at least one more character.
 *
 * @param {string} line - Raw line to inspect (not trimmed by this function).
 * @returns {{level: number, text: string}|null} The number of `#` characters
 *   and the trimmed heading text, or `null` when the line is not a heading.
 */
function detectHeadingLevel(line) {
  const parsed = line.match(/^(#{1,6})\s+(.+)$/);
  if (parsed) {
    const [, hashes, title] = parsed;
    return { level: hashes.length, text: title.trim() };
  }
  return null;
}

/**
 * Collect the formatting options of a run so they can be fed to a new TextRun.
 *
 * Reads the fields from `run.options`; when that property is missing or falsy
 * every field is `undefined` and `text` falls back to the empty string.
 *
 * NOTE(review): this assumes the run object exposes its constructor options as
 * `run.options` — confirm against the installed docx version.
 *
 * @param {object} run - Run object to read options from.
 * @returns {object} Plain object with `text`, `bold`, `italics`, `font`,
 *   `color`, `underline`, `superScript`, `shading` and `size` keys.
 */
function extractRunProps(run) {
  const {
    text,
    bold,
    italics,
    font,
    color,
    underline,
    superScript,
    shading,
    size,
  } = run.options || {};

  return {
    text: text || '',
    bold,
    italics,
    font,
    color,
    underline,
    superScript,
    shading,
    size,
  };
}

/**
 * Parse inline markup into plain TextRun option objects.
 *
 * Working on plain objects (instead of already-built TextRun instances) lets
 * wrapper tags such as <b>, <i> and <a> pass their formatting down to nested
 * content without having to read options back out of a TextRun.
 *
 * @param {string} text - Text that may contain <ic>, <il>, <ref>, <b>, <i>, <a> tags.
 * @param {object} [inherited={}] - Formatting options from enclosing tags;
 *   they are applied first, so the options of an inner tag win on conflict.
 * @returns {object[]} One options object per run, each with a `text` key.
 *   Never empty: text without any tag yields a single entry.
 */
function parseInlineRunOptions(text, inherited = {}) {
  // Alternation order matters: longer tag names first so that <ic>/<il> are
  // not read as <i>. The content group accepts any '<' that does not start
  // the closing tag of the tag being matched, which is what allows nesting.
  const tagPattern = /<(ic|il|ref|b|i|a)(\s[^>]*)?>([^<]*(?:<(?!\/\1>)[^<]*)*)<\/\1>/g;
  const specs = [];
  let pos = 0;

  for (const match of text.matchAll(tagPattern)) {
    const [whole, tagType, attrs = '', content] = match;

    // Text before the tag
    if (match.index > pos) {
      specs.push({ ...inherited, text: text.substring(pos, match.index) });
    }

    switch (tagType) {
      case 'ic':
        // Inline code: content is kept literally, never parsed for tags
        specs.push({
          ...inherited,
          text: content,
          font: 'Courier New',
          color: '333333',
          shading: { fill: 'E8E8E8', type: 'clear' },
        });
        break;

      case 'il':
        // Inline LaTeX: kept literally, shown in blue italics
        specs.push({ ...inherited, text: content, italics: true, color: '0066CC' });
        break;

      case 'b':
        specs.push(...parseInlineRunOptions(content, { ...inherited, bold: true }));
        break;

      case 'i':
        specs.push(...parseInlineRunOptions(content, { ...inherited, italics: true }));
        break;

      case 'a': {
        // Link — rendered as underlined blue text; nested tags inside the
        // link text are parsed rather than printed literally.
        const hrefMatch = attrs.match(/href="([^"]*)"/);
        const href = hrefMatch ? hrefMatch[1] : '';
        specs.push(...parseInlineRunOptions(content, {
          ...inherited,
          color: '0066CC',
          underline: { type: 'single' },
        }));
        // Append the URL in square brackets when it is a full URL
        if (href && href.startsWith('http')) {
          specs.push({ ...inherited, text: ` [${href}]`, color: '888888', size: 18 });
        }
        break;
      }

      case 'ref':
        specs.push({ ...inherited, text: content, superScript: true, color: '0066CC' });
        break;
    }

    pos = match.index + whole.length;
  }

  // Remaining text after the last tag
  if (pos < text.length) {
    specs.push({ ...inherited, text: text.substring(pos) });
  }

  return specs.length > 0 ? specs : [{ ...inherited, text }];
}

/**
 * Convert a line of text with inline markup into docx TextRun objects.
 *
 * Supported tags: <ic> inline code, <il> inline LaTeX, <ref> citation,
 * <b> bold, <i> italic, <a href="..."> link. <b>, <i> and <a> may contain
 * other supported tags; their formatting is combined with the inner tag's.
 *
 * @param {string} text - Text to parse.
 * @returns {TextRun[]} At least one run. Untagged text is passed to TextRun
 *   as a plain string; formatted text as an options object.
 */
function parseInlineFormatting(text) {
  return parseInlineRunOptions(text).map((options) => (
    // A spec holding only `text` carries no formatting: build it from the bare string.
    Object.keys(options).length === 1 ? new TextRun(options.text) : new TextRun(options)
  ));
}

/**
 * Build one DOCX Paragraph from the lines of a code block.
 *
 * Every line becomes a Courier New run; consecutive lines are separated by an
 * explicit line-break run so the block stays a single paragraph. The
 * paragraph itself carries the light gray background shading.
 *
 * @param {string[]} codeLines - Lines of the code block, in order.
 * @returns {Paragraph} Shaded paragraph containing the code runs.
 */
function codeBlockToParagraph(codeLines) {
  const toCodeRun = (codeLine) => new TextRun({
    text: codeLine,
    font: 'Courier New',
    size: 18,
    color: '333333',
  });

  // No break before the first line; exactly one break before each later line.
  const children = codeLines.flatMap((codeLine, index) => (
    index > 0
      ? [new TextRun({ break: 1 }), toCodeRun(codeLine)]
      : [toCodeRun(codeLine)]
  ));

  const spacing = {
    before: 200,
    after: 200,
    line: 276,
    lineRule: LineRuleType.AUTO,
  };

  return new Paragraph({
    children,
    shading: { fill: 'F5F5F5', type: 'clear' },
    spacing,
  });
}

/**
 * Build the paragraph for a figure (<f>), table (<t>) or display-LaTeX (<l>)
 * line. The line is kept verbatim (tags included) as bold, color-coded text.
 *
 * @param {string} trimmed - Trimmed source line.
 * @returns {Paragraph|null} The paragraph, or null when the line does not
 *   start with one of these tags.
 */
function blockTagParagraph(trimmed) {
  const tagStyles = [
    { tag: '<f>', color: '0066CC', layout: { spacing: { before: 200, after: 100 } } },
    { tag: '<t>', color: '009688', layout: { spacing: { before: 200, after: 100 } } },
    {
      tag: '<l>',
      color: '9C27B0',
      layout: { alignment: AlignmentType.CENTER, spacing: { before: 200, after: 200 } },
    },
  ];
  const style = tagStyles.find(({ tag }) => trimmed.startsWith(tag));
  if (!style) return null;

  return new Paragraph({
    children: [new TextRun({ text: trimmed, color: style.color, bold: true })],
    ...style.layout,
  });
}

/**
 * Turn the lines of the exported TXT file into DOCX paragraphs.
 *
 * Handles, in this order: blank lines, <c> code blocks (single- or
 * multi-line), <f>/<t>/<l> block tags, single-line <n> notes, `#` headings,
 * `- ` bullets, `1.` numbered items, `> ` blockquotes and regular paragraphs.
 *
 * @param {string[]} lines - Source lines without line terminators.
 * @returns {Paragraph[]} Paragraphs in document order.
 */
function buildParagraphs(lines) {
  const headingLevels = {
    1: HeadingLevel.HEADING_1,
    2: HeadingLevel.HEADING_2,
    3: HeadingLevel.HEADING_3,
    4: HeadingLevel.HEADING_4,
    5: HeadingLevel.HEADING_5,
    6: HeadingLevel.HEADING_6,
  };

  const paragraphs = [];
  let inCodeBlock = false;
  let codeLines = [];
  // Each run of consecutive numbered items gets its own numbering instance so
  // that a later list starts again at 1 instead of continuing the previous one.
  let numberingInstance = 0;
  let inNumberedList = false;

  for (const line of lines) {
    const trimmed = line.trim();

    // Blank lines become empty paragraphs, except inside a code block where
    // they are part of the code. They do not end a numbered list.
    if (!trimmed && !inCodeBlock) {
      paragraphs.push(new Paragraph({ text: '' }));
      continue;
    }

    const continuesNumberedList = inNumberedList;
    inNumberedList = false;

    // Handle code blocks <c>...</c>
    if (trimmed.startsWith('<c>')) {
      codeLines = [];
      if (trimmed.endsWith('</c>')) {
        // Single-line code block: <c>code</c>. An empty <c></c> pair emits
        // nothing and, importantly, does not leave a code block open.
        const inner = trimmed.replace(/^<c>/, '').replace(/<\/c>$/, '');
        if (inner) paragraphs.push(codeBlockToParagraph([inner]));
        inCodeBlock = false;
        continue;
      }
      inCodeBlock = true;
      // Strip leading indentation first so the opening tag is removed even
      // when the line is indented.
      const firstLine = line.trimStart().replace(/^<c>\s*/, '');
      if (firstLine && !firstLine.startsWith('</c>')) {
        codeLines.push(firstLine);
      }
      continue;
    }

    if (trimmed.endsWith('</c>')) {
      const lastLine = line.replace(/<\/c>\s*$/, '');
      // Ignore whitespace-only remainders such as the indent of a closing tag
      if (lastLine.trim()) codeLines.push(lastLine);

      // Add code block as paragraph with proper line breaks
      paragraphs.push(codeBlockToParagraph(codeLines));

      inCodeBlock = false;
      codeLines = [];
      continue;
    }

    if (inCodeBlock) {
      codeLines.push(line);
      continue;
    }

    // Handle figure / table / display-LaTeX tags
    const tagParagraph = blockTagParagraph(trimmed);
    if (tagParagraph) {
      paragraphs.push(tagParagraph);
      continue;
    }

    // Handle single-line note/callout tags <n>...</n>
    if (trimmed.startsWith('<n>') && trimmed.endsWith('</n>')) {
      const inner = trimmed.replace(/^<n>/, '').replace(/<\/n>$/, '');
      paragraphs.push(new Paragraph({
        children: parseInlineFormatting(inner),
        indent: { left: 360 },
        shading: { fill: 'FFF8E1', type: 'clear' },
        spacing: { before: 200, after: 200 },
      }));
      continue;
    }

    // Handle headings (detected on the untrimmed line, so indented '#' lines
    // are not headings)
    const heading = detectHeadingLevel(line);
    if (heading) {
      paragraphs.push(new Paragraph({
        text: heading.text,
        heading: headingLevels[heading.level],
        spacing: { before: 400, after: 200 },
      }));
      continue;
    }

    // Handle list items
    if (trimmed.startsWith('- ')) {
      paragraphs.push(new Paragraph({
        children: parseInlineFormatting(trimmed.substring(2)),
        bullet: { level: 0 },
        spacing: { before: 100, after: 100 },
      }));
      continue;
    }

    // Handle numbered lists
    const numberedMatch = trimmed.match(/^(\d+)\.\s+(.+)$/);
    if (numberedMatch) {
      if (!continuesNumberedList) numberingInstance += 1;
      inNumberedList = true;
      paragraphs.push(new Paragraph({
        children: parseInlineFormatting(numberedMatch[2]),
        numbering: { reference: 'default-numbering', level: 0, instance: numberingInstance },
        spacing: { before: 100, after: 100 },
      }));
      continue;
    }

    // Handle blockquotes
    if (trimmed.startsWith('> ')) {
      paragraphs.push(new Paragraph({
        children: parseInlineFormatting(trimmed.substring(2)),
        // NOTE(review): `italics` is normally a run-level option in docx;
        // confirm whether it has any effect at paragraph level.
        italics: true,
        indent: { left: 720 },
        spacing: { before: 200, after: 200 },
      }));
      continue;
    }

    // Regular paragraph (the line is known to be non-blank here)
    paragraphs.push(new Paragraph({
      children: parseInlineFormatting(trimmed),
      spacing: { before: 100, after: 100 },
    }));
  }

  // Do not silently drop a code block whose closing </c> is missing
  if (inCodeBlock && codeLines.length > 0) {
    console.warn('⚠️  Unterminated <c> block at end of file; emitting collected lines.');
    paragraphs.push(codeBlockToParagraph(codeLines));
  }

  return paragraphs;
}

/**
 * Convert the exported TXT file into a DOCX document and write it to disk.
 *
 * @param {string} txtPath - Path of the TXT file to read (UTF-8).
 * @param {string} outputPath - Path the DOCX file is written to.
 * @returns {Promise<void>} Resolves once the DOCX file has been written.
 *   Rejects when reading, packing or writing fails.
 */
async function convertTxtToDocx(txtPath, outputPath) {
  console.log(`📖 Reading TXT file: ${txtPath}`);
  const content = await fs.readFile(txtPath, 'utf-8');
  // Drop a leading BOM and accept CRLF as well as LF: a stray '\r' or BOM
  // would otherwise defeat heading detection and leak into code lines.
  const lines = content.replace(/^\uFEFF/, '').split(/\r?\n/);

  const paragraphs = buildParagraphs(lines);

  console.log(`📝 Creating DOCX with ${paragraphs.length} paragraphs...`);

  const doc = new Document({
    // Numbered-list paragraphs refer to 'default-numbering'; the reference
    // has to be declared here for the numbering to resolve.
    numbering: {
      config: [{
        reference: 'default-numbering',
        levels: [{
          level: 0,
          format: 'decimal', // same value as docx's LevelFormat.DECIMAL
          text: '%1.',
          alignment: AlignmentType.START,
          style: { paragraph: { indent: { left: 720, hanging: 360 } } },
        }],
      }],
    },
    sections: [{
      properties: {},
      children: paragraphs,
    }],
  });

  console.log(`💾 Writing DOCX to: ${outputPath}`);
  const buffer = await Packer.toBuffer(doc);
  await fs.writeFile(outputPath, buffer);

  console.log(`✅ DOCX created successfully!`);
}

/**
 * Command-line entry point: resolve the input/output paths, convert the TXT
 * file to DOCX and copy the result into `<cwd>/public`.
 *
 * Options (parsed by `parseArgs`):
 *   --input=PATH   TXT file to convert (defaults to the exported book in dist/)
 *   --output=PATH  DOCX file to write (defaults to the input path with the
 *                  `.txt` extension replaced by `.docx`)
 *
 * Exits the process with code 1 when an option is given without a value or
 * when the input file cannot be accessed. A failed copy into public/ only
 * produces a warning.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const cwd = process.cwd();
  const args = parseArgs(process.argv);

  // A bare `--input` / `--output` parses to `true`, which is not a usable path.
  for (const flag of ['input', 'output']) {
    if (args[flag] !== undefined && typeof args[flag] !== 'string') {
      console.error(`❌ Error: --${flag} requires a value, e.g. --${flag}=path/to/file`);
      process.exit(1);
    }
  }

  const inputPath = args.input || resolve(cwd, 'dist', 'the-smol-training-playbook-the-secrets-to-building-world-class-llms.txt');
  // Swap only a trailing .txt extension; otherwise append .docx so the output
  // can never be the input file itself.
  const defaultOutputPath = /\.txt$/i.test(inputPath)
    ? inputPath.replace(/\.txt$/i, '.docx')
    : `${inputPath}.docx`;
  const outputPath = args.output || defaultOutputPath;

  // Check if input exists
  try {
    await fs.access(inputPath);
  } catch {
    console.error(`❌ Error: Input file not found: ${inputPath}`);
    console.error('   Run "npm run export:txt" first to generate the TXT file.');
    process.exit(1);
  }

  await convertTxtToDocx(inputPath, outputPath);

  // Also copy to the public folder. The target is always inside the directory
  // created below, whatever directory the DOCX was written to.
  const publicDir = resolve(cwd, 'public');
  const publicPath = resolve(publicDir, basename(outputPath));
  if (publicPath === resolve(cwd, outputPath)) {
    // The DOCX was written straight into public/; nothing to copy.
    return;
  }
  try {
    await fs.mkdir(publicDir, { recursive: true });
    await fs.copyFile(outputPath, publicPath);
    console.log(`✅ DOCX copied to: ${publicPath}`);
  } catch (e) {
    console.warn('Unable to copy DOCX to public/:', e?.message || e);
  }
}

/**
 * Last-resort handler for a rejected `main()`: print the message, then the
 * error value itself, and exit with status 1.
 */
function reportFatalAndExit(failure) {
  console.error('❌ Error:', failure.message);
  console.error(failure);
  process.exit(1);
}

main().catch(reportFatalAndExit);