Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
Fix exporter
#13
by
lewtun HF Staff - opened
- app/scripts/README-TXT-EXPORT.md +13 -3
- app/scripts/export-docx.mjs +34 -12
- app/scripts/export-txt.mjs +188 -26
- app/scripts/screenshot-elements.mjs +130 -14
app/scripts/README-TXT-EXPORT.md
CHANGED
|
@@ -5,13 +5,23 @@ This script exports the article to a simple text format suitable for book publis
|
|
| 5 |
## Usage
|
| 6 |
|
| 7 |
```bash
|
| 8 |
-
|
| 9 |
```
|
| 10 |
|
| 11 |
Or with custom filename:
|
| 12 |
|
| 13 |
```bash
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
```
|
| 16 |
|
| 17 |
## Output
|
|
@@ -65,7 +75,7 @@ Example:
|
|
| 65 |
```
|
| 66 |
Example:
|
| 67 |
```
|
| 68 |
-
Use the <ic>
|
| 69 |
```
|
| 70 |
|
| 71 |
#### LaTeX Formulas
|
|
|
|
| 5 |
## Usage
|
| 6 |
|
| 7 |
```bash
|
| 8 |
+
yarn export:txt
|
| 9 |
```
|
| 10 |
|
| 11 |
Or with custom filename:
|
| 12 |
|
| 13 |
```bash
|
| 14 |
+
yarn export:txt -- --filename=my-article
|
| 15 |
+
```
|
| 16 |
+
|
| 17 |
+
By default, code blocks inside `<c>...</c>` are hard-wrapped to 80 characters per line. You can disable or configure this:
|
| 18 |
+
|
| 19 |
+
```bash
|
| 20 |
+
# Disable wrapping
|
| 21 |
+
yarn export:txt -- --wrap-code=false
|
| 22 |
+
|
| 23 |
+
# Change wrap width
|
| 24 |
+
yarn export:txt -- --code-width=100
|
| 25 |
```
|
| 26 |
|
| 27 |
## Output
|
|
|
|
| 75 |
```
|
| 76 |
Example:
|
| 77 |
```
|
| 78 |
+
Use the <ic>yarn install</ic> command to install dependencies.
|
| 79 |
```
|
| 80 |
|
| 81 |
#### LaTeX Formulas
|
app/scripts/export-docx.mjs
CHANGED
|
@@ -14,7 +14,7 @@
|
|
| 14 |
* npm run export:docx
|
| 15 |
*/
|
| 16 |
|
| 17 |
-
import { Document, Packer, Paragraph, TextRun, HeadingLevel, AlignmentType } from 'docx';
|
| 18 |
import { promises as fs } from 'node:fs';
|
| 19 |
import { resolve } from 'node:path';
|
| 20 |
import process from 'node:process';
|
|
@@ -93,6 +93,21 @@ function parseInlineFormatting(text) {
|
|
| 93 |
return runs.length > 0 ? runs : [new TextRun(text)];
|
| 94 |
}
|
| 95 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
async function convertTxtToDocx(txtPath, outputPath) {
|
| 97 |
console.log(`📖 Reading TXT file: ${txtPath}`);
|
| 98 |
const content = await fs.readFile(txtPath, 'utf-8');
|
|
@@ -112,12 +127,25 @@ async function convertTxtToDocx(txtPath, outputPath) {
|
|
| 112 |
}
|
| 113 |
|
| 114 |
// Handle code blocks <c>...</c>
|
| 115 |
-
if (line
|
| 116 |
inCodeBlock = true;
|
| 117 |
codeLines = [];
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
}
|
| 122 |
continue;
|
| 123 |
}
|
|
@@ -128,13 +156,7 @@ async function convertTxtToDocx(txtPath, outputPath) {
|
|
| 128 |
|
| 129 |
// Add code block as paragraph(s)
|
| 130 |
if (codeLines.length > 0) {
|
| 131 |
-
paragraphs.push(
|
| 132 |
-
text: codeLines.join('\n'),
|
| 133 |
-
font: 'Courier New',
|
| 134 |
-
size: 20,
|
| 135 |
-
shading: { fill: 'F5F5F5', type: 'clear' },
|
| 136 |
-
spacing: { before: 200, after: 200 }
|
| 137 |
-
}));
|
| 138 |
}
|
| 139 |
|
| 140 |
inCodeBlock = false;
|
|
|
|
| 14 |
* npm run export:docx
|
| 15 |
*/
|
| 16 |
|
| 17 |
+
import { Document, Packer, Paragraph, TextRun, HeadingLevel, AlignmentType, LineRuleType } from 'docx';
|
| 18 |
import { promises as fs } from 'node:fs';
|
| 19 |
import { resolve } from 'node:path';
|
| 20 |
import process from 'node:process';
|
|
|
|
| 93 |
return runs.length > 0 ? runs : [new TextRun(text)];
|
| 94 |
}
|
| 95 |
|
| 96 |
+
/**
 * Build a single shaded, monospaced DOCX paragraph from the lines of a code block.
 *
 * Every source line becomes its own run. Runs after the first one are preceded
 * by one line break, so the whole block stays inside one paragraph instead of
 * being split into several.
 *
 * @param {string[]} codeLines - Lines of the code block, in order.
 * @returns {Paragraph} Paragraph containing one run per code line.
 */
function codeBlockToParagraph(codeLines) {
  const codeRuns = [];

  for (const [idx, codeLine] of codeLines.entries()) {
    // Only lines after the first need a leading break.
    const leadingBreaks = idx === 0 ? 0 : 1;
    codeRuns.push(
      new TextRun({
        break: leadingBreaks,
        children: [codeLine],
        font: 'Courier New',
        size: 20,
      }),
    );
  }

  const spacing = { before: 200, after: 200, line: 240, lineRule: LineRuleType.AUTO };
  const shading = { fill: 'F5F5F5', type: 'clear' };

  return new Paragraph({ children: codeRuns, shading, spacing });
}
|
| 110 |
+
|
| 111 |
async function convertTxtToDocx(txtPath, outputPath) {
|
| 112 |
console.log(`📖 Reading TXT file: ${txtPath}`);
|
| 113 |
const content = await fs.readFile(txtPath, 'utf-8');
|
|
|
|
| 127 |
}
|
| 128 |
|
| 129 |
// Handle code blocks <c>...</c>
|
| 130 |
+
if (/^\s*<c>/.test(line)) {
|
| 131 |
inCodeBlock = true;
|
| 132 |
codeLines = [];
|
| 133 |
+
|
| 134 |
+
const firstLine = line.replace(/^\s*<c>/, '');
|
| 135 |
+
if (firstLine) {
|
| 136 |
+
// Handle single-line code blocks like: <c>code...</c>
|
| 137 |
+
if (firstLine.trim().endsWith('</c>')) {
|
| 138 |
+
const singleLine = firstLine.replace(/<\/c>\s*$/, '');
|
| 139 |
+
codeLines.push(singleLine);
|
| 140 |
+
paragraphs.push(codeBlockToParagraph(codeLines));
|
| 141 |
+
inCodeBlock = false;
|
| 142 |
+
codeLines = [];
|
| 143 |
+
continue;
|
| 144 |
+
}
|
| 145 |
+
|
| 146 |
+
if (!firstLine.startsWith('</c>')) {
|
| 147 |
+
codeLines.push(firstLine);
|
| 148 |
+
}
|
| 149 |
}
|
| 150 |
continue;
|
| 151 |
}
|
|
|
|
| 156 |
|
| 157 |
// Add code block as paragraph(s)
|
| 158 |
if (codeLines.length > 0) {
|
| 159 |
+
paragraphs.push(codeBlockToParagraph(codeLines));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
}
|
| 161 |
|
| 162 |
inCodeBlock = false;
|
app/scripts/export-txt.mjs
CHANGED
|
@@ -35,16 +35,23 @@ async function run(command, args = [], options = {}) {
|
|
| 35 |
});
|
| 36 |
}
|
| 37 |
|
| 38 |
-
async function waitForServer(
|
|
|
|
| 39 |
const start = Date.now();
|
| 40 |
while (Date.now() - start < timeoutMs) {
|
| 41 |
try {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
const res = await fetch(url);
|
| 43 |
if (res.ok) return;
|
| 44 |
} catch { }
|
| 45 |
await delay(500);
|
| 46 |
}
|
| 47 |
-
|
|
|
|
| 48 |
}
|
| 49 |
|
| 50 |
function parseArgs(argv) {
|
|
@@ -57,6 +64,15 @@ function parseArgs(argv) {
|
|
| 57 |
return out;
|
| 58 |
}
|
| 59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
function slugify(text) {
|
| 61 |
return String(text || '')
|
| 62 |
.normalize('NFKD')
|
|
@@ -99,6 +115,82 @@ function headingToMarkdown(level, text) {
|
|
| 99 |
return `${hashes} ${text}`;
|
| 100 |
}
|
| 101 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
/**
|
| 103 |
* Extract and convert article content to TXT format
|
| 104 |
*/
|
|
@@ -156,8 +248,12 @@ async function extractArticleContent(page) {
|
|
| 156 |
const main = document.querySelector('main');
|
| 157 |
if (!main) return 'Error: main element not found';
|
| 158 |
|
| 159 |
-
// Helper: get all visual elements in DOM order (
|
| 160 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
const elementIndexMap = new Map();
|
| 162 |
|
| 163 |
// Pre-process: assign global indices to visual elements
|
|
@@ -168,7 +264,41 @@ async function extractArticleContent(page) {
|
|
| 168 |
// Walk through all child nodes
|
| 169 |
const processNode = (node) => {
|
| 170 |
const tag = node.tagName?.toLowerCase();
|
| 171 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
// Headings
|
| 173 |
if (/^h[1-6]$/.test(tag)) {
|
| 174 |
const level = parseInt(tag[1]);
|
|
@@ -291,9 +421,20 @@ async function extractArticleContent(page) {
|
|
| 291 |
|
| 292 |
// Figures (images, embeds)
|
| 293 |
if (tag === 'figure') {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
const img = node.querySelector('img');
|
| 295 |
-
const htmlEmbed =
|
| 296 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 297 |
const caption = node.querySelector('figcaption');
|
| 298 |
|
| 299 |
// Skip if it's not really a figure (no img, no embed, no caption)
|
|
@@ -358,14 +499,15 @@ async function extractArticleContent(page) {
|
|
| 358 |
|
| 359 |
// Notes (Note component and Sidenote)
|
| 360 |
if (node.classList?.contains('note') || node.classList?.contains('sidenote')) {
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
|
|
|
| 369 |
return;
|
| 370 |
}
|
| 371 |
|
|
@@ -418,6 +560,22 @@ async function main() {
|
|
| 418 |
console.log('> Starting Astro preview…');
|
| 419 |
// Capture stdout to detect the actual port used
|
| 420 |
let capturedPort = 8080;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 421 |
const preview = spawn('npm', ['run', 'preview'], {
|
| 422 |
cwd,
|
| 423 |
stdio: ['ignore', 'pipe', 'pipe'],
|
|
@@ -428,27 +586,28 @@ async function main() {
|
|
| 428 |
preview.stdout.on('data', (data) => {
|
| 429 |
const output = data.toString();
|
| 430 |
process.stdout.write(output);
|
| 431 |
-
|
| 432 |
-
if (match) {
|
| 433 |
-
capturedPort = parseInt(match[1]);
|
| 434 |
-
}
|
| 435 |
});
|
| 436 |
|
| 437 |
preview.stderr.on('data', (data) => {
|
| 438 |
-
|
|
|
|
|
|
|
| 439 |
});
|
| 440 |
|
| 441 |
const previewExit = new Promise((resolvePreview) => {
|
| 442 |
preview.on('close', (code, signal) => resolvePreview({ code, signal }));
|
| 443 |
});
|
| 444 |
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
|
|
|
| 448 |
|
| 449 |
try {
|
| 450 |
-
await waitForServer(
|
| 451 |
-
|
|
|
|
| 452 |
|
| 453 |
const browser = await chromium.launch({ headless: true });
|
| 454 |
try {
|
|
@@ -479,10 +638,13 @@ async function main() {
|
|
| 479 |
|
| 480 |
console.log('> Extracting article content…');
|
| 481 |
const txtContent = await extractArticleContent(page);
|
|
|
|
|
|
|
|
|
|
| 482 |
|
| 483 |
// Write output
|
| 484 |
const outPath = resolve(cwd, 'dist', `${outFileBase}.txt`);
|
| 485 |
-
await fs.writeFile(outPath,
|
| 486 |
console.log(`✅ TXT exported: ${outPath}`);
|
| 487 |
|
| 488 |
// Copy to public folder
|
|
|
|
| 35 |
});
|
| 36 |
}
|
| 37 |
|
| 38 |
+
/**
 * Poll a URL until it answers with a successful (2xx) HTTP response.
 *
 * The URL may be given directly or as a function returning it; the function form
 * lets the caller supply a URL that only becomes known later (it is re-evaluated
 * on every attempt, and a falsy result means "not known yet, keep waiting").
 *
 * Each request is bounded by the time left in the overall budget, so a
 * connection that is accepted but never answered cannot make the call run past
 * `timeoutMs`.
 *
 * @param {string | (() => (string | null | undefined))} urlOrFn - URL to poll, or a getter for it.
 * @param {number} [timeoutMs=60000] - Overall time budget in milliseconds.
 * @returns {Promise<void>} Resolves once a request returns an OK response.
 * @throws {Error} When no OK response is seen within `timeoutMs`; the most
 *   recent failure (if any) is attached as `cause`.
 */
async function waitForServer(urlOrFn, timeoutMs = 60000) {
  const getUrl = typeof urlOrFn === 'function' ? urlOrFn : () => urlOrFn;
  // Upper bound accepted by timers; also keeps AbortSignal.timeout() arguments valid.
  const maxTimerMs = 2 ** 31 - 1;
  const start = Date.now();
  let lastError;

  while (Date.now() - start < timeoutMs) {
    try {
      const url = getUrl();
      if (!url) {
        // URL not known yet: poll a little faster than the retry interval.
        await delay(200);
        continue;
      }
      const remaining = timeoutMs - (Date.now() - start);
      const attemptBudget = Math.min(Math.max(1, Math.ceil(remaining)), maxTimerMs);
      const res = await fetch(url, { signal: AbortSignal.timeout(attemptBudget) });
      if (res.ok) return;
      lastError = new Error(`Unexpected HTTP status ${res.status} from ${url}`);
    } catch (err) {
      // Connection failures are expected while the server boots; remember the
      // latest one so the final timeout error can explain what went wrong.
      lastError = err;
    }
    await delay(500);
  }

  const lastUrl = getUrl();
  const message = `Server did not start in time: ${lastUrl || '(unknown url)'}`;
  throw lastError === undefined ? new Error(message) : new Error(message, { cause: lastError });
}
|
| 56 |
|
| 57 |
function parseArgs(argv) {
|
|
|
|
| 64 |
return out;
|
| 65 |
}
|
| 66 |
|
| 67 |
+
/**
 * Interpret a loosely-typed flag value as a boolean.
 *
 * `undefined` and unrecognised values yield `defaultValue`; the literal `true`
 * yields `true`. Anything else is stringified, trimmed and lower-cased, then
 * matched against the accepted spellings:
 *   true  -> "1", "true", "yes", "y", "on"
 *   false -> "0", "false", "no", "n", "off"
 *
 * @param {*} value - Raw flag value.
 * @param {*} defaultValue - Result when `value` is missing or not recognised.
 * @returns {*} `true`, `false`, or `defaultValue`.
 */
function parseBoolean(value, defaultValue) {
  if (value === undefined) return defaultValue;
  if (value === true) return true;

  switch (String(value).trim().toLowerCase()) {
    case '1':
    case 'true':
    case 'yes':
    case 'y':
    case 'on':
      return true;
    case '0':
    case 'false':
    case 'no':
    case 'n':
    case 'off':
      return false;
    default:
      return defaultValue;
  }
}
|
| 75 |
+
|
| 76 |
function slugify(text) {
|
| 77 |
return String(text || '')
|
| 78 |
.normalize('NFKD')
|
|
|
|
| 115 |
return `${hashes} ${text}`;
|
| 116 |
}
|
| 117 |
|
| 118 |
+
/**
 * Hard-wrap one line to `maxWidth` characters, repeating its leading
 * indentation on every continuation line.
 *
 * Breaks are placed at the right-most whitespace that lets the chunk fit; when
 * a chunk contains no whitespace it is split mid-token. Whitespace around a
 * break is dropped. At least 10 characters are always kept for content, so very
 * deeply indented lines may still exceed `maxWidth`.
 *
 * @param {string} line - Line to wrap (must not contain newlines).
 * @param {number} maxWidth - Target maximum line length, indentation included.
 * @returns {string[]} The wrapped lines; `[line]` when it already fits.
 */
function wrapLineWithIndent(line, maxWidth) {
  if (line.length <= maxWidth) return [line];

  // `\s*` always matches (possibly the empty string), so the match is never null.
  const indent = line.match(/^\s*/)[0];
  const available = Math.max(10, maxWidth - indent.length);

  // Invariant: `rest` never starts with whitespace.
  let rest = line.slice(indent.length);
  const out = [];

  while (rest.length > available) {
    // Find the right-most whitespace at index <= available. Checking index
    // `available` itself lets a word that exactly fills the width stay on this
    // line instead of being pushed to the next one.
    let breakPos = -1;
    for (let i = available; i >= 1; i--) {
      if (/\s/.test(rest[i])) {
        breakPos = i;
        break;
      }
    }
    // No whitespace to break on: split the token at the width limit.
    if (breakPos === -1) breakPos = available;

    out.push(indent + rest.slice(0, breakPos).trimEnd());
    rest = rest.slice(breakPos).trimStart();
  }

  // Skip the remainder when only trailing whitespace was left, so no blank
  // (indent-only) line is appended. A whitespace-only input is still returned
  // as a single line.
  if (rest.length > 0 || out.length === 0) out.push(indent + rest);
  return out;
}
|
| 147 |
+
|
| 148 |
+
/**
 * Wrap the inner text of a `<c>...</c>` block so that, once the tags are put
 * back, no line is longer than `maxWidth`.
 *
 * The text is first wrapped line by line at the full width. The opening tag
 * shares a line with the first line of text and the closing tag with the last,
 * so those lines are re-wrapped with the tag length subtracted. A block that is
 * a single line carries both tags.
 *
 * @param {string} codeText - Text between the tags.
 * @param {number|string} maxWidth - Maximum line length; values that are not a
 *   finite number greater than zero disable wrapping.
 * @returns {string} The wrapped text, lines joined with "\n".
 */
function wrapCodeTextAccountingForTags(codeText, maxWidth) {
  const text = String(codeText || '');
  const width = Number(maxWidth);
  if (!(Number.isFinite(width) && width > 0)) return text;

  const openTagLen = '<c>'.length;
  const closeTagLen = '</c>'.length;

  const lines = text.split('\n').flatMap((line) => wrapLineWithIndent(line, width));
  if (lines.length === 0) return '';

  // Single line: it carries both the opening and the closing tag.
  if (lines.length === 1) {
    const [only] = lines;
    const room = width - openTagLen - closeTagLen;
    const needsRewrap = room > 0 && only.length > room;
    return needsRewrap ? wrapLineWithIndent(only, room).join('\n') : only;
  }

  // Several lines: the opening tag sits on the first one only...
  const firstRoom = width - openTagLen;
  if (firstRoom > 0 && lines[0].length > firstRoom) {
    lines.splice(0, 1, ...wrapLineWithIndent(lines[0], firstRoom));
  }

  // ...and the closing tag on the last one only (index taken after the splice above).
  const lastRoom = width - closeTagLen;
  const tail = lines.length - 1;
  if (lastRoom > 0 && lines[tail].length > lastRoom) {
    lines.splice(tail, 1, ...wrapLineWithIndent(lines[tail], lastRoom));
  }

  return lines.join('\n');
}
|
| 183 |
+
|
| 184 |
+
/**
 * Hard-wrap the contents of every `<c>...</c>` block in an exported TXT document.
 *
 * Text outside the blocks is left untouched. When `maxWidth` is not a finite
 * number greater than zero the input is returned as-is.
 *
 * @param {string} txt - Exported document text.
 * @param {number|string} [maxWidth=80] - Maximum line length inside code blocks.
 * @returns {string} Document with wrapped code blocks.
 */
function wrapCodeBlocksInTxt(txt, maxWidth = 80) {
  const width = Number(maxWidth);
  const wrappingEnabled = Number.isFinite(width) && width > 0;
  if (!wrappingEnabled) return txt;

  // Non-greedy so that each block is matched on its own.
  const codeBlockPattern = /<c>([\s\S]*?)<\/c>/g;
  const source = String(txt || '');

  return source.replace(
    codeBlockPattern,
    (_whole, body) => `<c>${wrapCodeTextAccountingForTags(body, width)}</c>`,
  );
}
|
| 193 |
+
|
| 194 |
/**
|
| 195 |
* Extract and convert article content to TXT format
|
| 196 |
*/
|
|
|
|
| 248 |
const main = document.querySelector('main');
|
| 249 |
if (!main) return 'Error: main element not found';
|
| 250 |
|
| 251 |
+
// Helper: get all visual elements in DOM order (match screenshot-elements.mjs)
|
| 252 |
+
// NOTE: Don't include generic `figure` here. The screenshot script indexes only
|
| 253 |
+
// `.html-embed`, `.table-scroll > table`, `.image-wrapper`, and `.katex-display`.
|
| 254 |
+
const allVisualElements = Array.from(
|
| 255 |
+
document.querySelectorAll('.html-embed, .table-scroll > table, .image-wrapper, .katex-display'),
|
| 256 |
+
);
|
| 257 |
const elementIndexMap = new Map();
|
| 258 |
|
| 259 |
// Pre-process: assign global indices to visual elements
|
|
|
|
| 264 |
// Walk through all child nodes
|
| 265 |
const processNode = (node) => {
|
| 266 |
const tag = node.tagName?.toLowerCase();
|
| 267 |
+
|
| 268 |
+
// Images (Image.astro renders a `.image-wrapper` container; it may or may not contain a <figure>)
|
| 269 |
+
if (node.classList?.contains('image-wrapper')) {
|
| 270 |
+
const globalIndex = elementIndexMap.get(node);
|
| 271 |
+
if (!globalIndex) return;
|
| 272 |
+
|
| 273 |
+
const img = node.querySelector('img');
|
| 274 |
+
const figure = node.querySelector('figure');
|
| 275 |
+
const caption = figure?.querySelector('figcaption') || node.querySelector('figcaption');
|
| 276 |
+
|
| 277 |
+
let name = '';
|
| 278 |
+
let anchor = '';
|
| 279 |
+
let description = '';
|
| 280 |
+
|
| 281 |
+
// Prefer an explicit figure ID (used for cross-references), otherwise wrapper ID.
|
| 282 |
+
if (figure?.id) anchor = figure.id;
|
| 283 |
+
else if (node.id) anchor = node.id;
|
| 284 |
+
|
| 285 |
+
if (caption) {
|
| 286 |
+
const captionText = stripHtml(caption.innerHTML);
|
| 287 |
+
const parsed = parseCaptionText(captionText, 'Figure');
|
| 288 |
+
name = parsed.name;
|
| 289 |
+
description = parsed.description;
|
| 290 |
+
}
|
| 291 |
+
|
| 292 |
+
if (!description && img?.alt) description = img.alt;
|
| 293 |
+
if (!name) name = `image-${globalIndex}`;
|
| 294 |
+
|
| 295 |
+
const parts = [name];
|
| 296 |
+
if (anchor) parts.push(anchor);
|
| 297 |
+
if (description) parts.push(description);
|
| 298 |
+
output.push(`<f>${parts.join(' | ')}</f>\n\n`);
|
| 299 |
+
return;
|
| 300 |
+
}
|
| 301 |
+
|
| 302 |
// Headings
|
| 303 |
if (/^h[1-6]$/.test(tag)) {
|
| 304 |
const level = parseInt(tag[1]);
|
|
|
|
| 421 |
|
| 422 |
// Figures (images, embeds)
|
| 423 |
if (tag === 'figure') {
|
| 424 |
+
// If this <figure> is inside an `.image-wrapper`, the wrapper handler above will emit
|
| 425 |
+
// a single <f> tag for the whole image. Avoid double-emitting.
|
| 426 |
+
if (node.closest?.('.image-wrapper')) {
|
| 427 |
+
const isHtmlEmbedFigure = node.matches?.('.html-embed, .html-embed--screenshot');
|
| 428 |
+
if (!isHtmlEmbedFigure) return;
|
| 429 |
+
}
|
| 430 |
+
|
| 431 |
const img = node.querySelector('img');
|
| 432 |
+
const htmlEmbed =
|
| 433 |
+
node.matches?.('.html-embed, .html-embed--screenshot')
|
| 434 |
+
? node
|
| 435 |
+
: node.querySelector('.html-embed, .html-embed--screenshot');
|
| 436 |
+
// Images are wrapped in a sibling/parent `.image-wrapper` container in this codebase
|
| 437 |
+
const imageWrapper = node.closest?.('.image-wrapper') || null;
|
| 438 |
const caption = node.querySelector('figcaption');
|
| 439 |
|
| 440 |
// Skip if it's not really a figure (no img, no embed, no caption)
|
|
|
|
| 499 |
|
| 500 |
// Notes (Note component and Sidenote)
|
| 501 |
if (node.classList?.contains('note') || node.classList?.contains('sidenote')) {
|
| 502 |
+
// For Note.astro, avoid duplicating the header/title in exported text by
|
| 503 |
+
// extracting only the body content.
|
| 504 |
+
const contentNode =
|
| 505 |
+
node.classList?.contains('note')
|
| 506 |
+
? (node.querySelector('.note__content, .note-content') ?? node)
|
| 507 |
+
: node;
|
| 508 |
+
const content = cleanText(contentNode.textContent);
|
| 509 |
+
|
| 510 |
+
if (content) output.push(`<n>${content}</n>\n\n`);
|
| 511 |
return;
|
| 512 |
}
|
| 513 |
|
|
|
|
| 560 |
console.log('> Starting Astro preview…');
|
| 561 |
// Capture stdout to detect the actual port used
|
| 562 |
let capturedPort = 8080;
|
| 563 |
+
let sawPreviewUrl = false;
|
| 564 |
+
|
| 565 |
+
const maybeCapturePort = (output) => {
|
| 566 |
+
const match = output.match(/http:\/\/localhost:(\d+)\//);
|
| 567 |
+
if (match) {
|
| 568 |
+
capturedPort = parseInt(match[1]);
|
| 569 |
+
sawPreviewUrl = true;
|
| 570 |
+
}
|
| 571 |
+
};
|
| 572 |
+
|
| 573 |
+
const previewPortEnv = process.env.PREVIEW_PORT ? Number(process.env.PREVIEW_PORT) : null;
|
| 574 |
+
if (previewPortEnv) {
|
| 575 |
+
capturedPort = previewPortEnv;
|
| 576 |
+
sawPreviewUrl = true;
|
| 577 |
+
}
|
| 578 |
+
|
| 579 |
const preview = spawn('npm', ['run', 'preview'], {
|
| 580 |
cwd,
|
| 581 |
stdio: ['ignore', 'pipe', 'pipe'],
|
|
|
|
| 586 |
preview.stdout.on('data', (data) => {
|
| 587 |
const output = data.toString();
|
| 588 |
process.stdout.write(output);
|
| 589 |
+
maybeCapturePort(output);
|
|
|
|
|
|
|
|
|
|
| 590 |
});
|
| 591 |
|
| 592 |
preview.stderr.on('data', (data) => {
|
| 593 |
+
const output = data.toString();
|
| 594 |
+
process.stderr.write(output);
|
| 595 |
+
maybeCapturePort(output);
|
| 596 |
});
|
| 597 |
|
| 598 |
const previewExit = new Promise((resolvePreview) => {
|
| 599 |
preview.on('close', (code, signal) => resolvePreview({ code, signal }));
|
| 600 |
});
|
| 601 |
|
| 602 |
+
const getBaseUrl = () => {
|
| 603 |
+
if (!sawPreviewUrl) return null;
|
| 604 |
+
return `http://localhost:${capturedPort}/`;
|
| 605 |
+
};
|
| 606 |
|
| 607 |
try {
|
| 608 |
+
await waitForServer(getBaseUrl, 60000);
|
| 609 |
+
const baseUrl = getBaseUrl();
|
| 610 |
+
console.log(`> Server ready (${baseUrl}), extracting content…`);
|
| 611 |
|
| 612 |
const browser = await chromium.launch({ headless: true });
|
| 613 |
try {
|
|
|
|
| 638 |
|
| 639 |
console.log('> Extracting article content…');
|
| 640 |
const txtContent = await extractArticleContent(page);
|
| 641 |
+
const shouldWrapCode = parseBoolean(args['wrap-code'], true);
|
| 642 |
+
const codeWidth = Number(args['code-width'] ?? 80);
|
| 643 |
+
const finalTxtContent = shouldWrapCode ? wrapCodeBlocksInTxt(txtContent, codeWidth) : txtContent;
|
| 644 |
|
| 645 |
// Write output
|
| 646 |
const outPath = resolve(cwd, 'dist', `${outFileBase}.txt`);
|
| 647 |
+
await fs.writeFile(outPath, finalTxtContent, 'utf-8');
|
| 648 |
console.log(`✅ TXT exported: ${outPath}`);
|
| 649 |
|
| 650 |
// Copy to public folder
|
app/scripts/screenshot-elements.mjs
CHANGED
|
@@ -1,11 +1,15 @@
|
|
| 1 |
import { chromium } from 'playwright';
|
| 2 |
import { mkdir } from 'fs/promises';
|
| 3 |
-
import { join } from 'path';
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
-
const URL = 'http://localhost:4321/?viz=true';
|
| 6 |
const OUTPUT_DIR = './screenshots';
|
| 7 |
const DEVICE_SCALE_FACTOR = 4; // 4x for high-quality print
|
| 8 |
const BASE_VIEWPORT = { width: 1200, height: 800 };
|
|
|
|
| 9 |
|
| 10 |
const slugify = (value) =>
|
| 11 |
String(value || '')
|
|
@@ -14,9 +18,107 @@ const slugify = (value) =>
|
|
| 14 |
.replace(/[^a-z0-9]+/g, '-')
|
| 15 |
.replace(/^-+|-+$/g, '');
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
async function main() {
|
| 18 |
await mkdir(OUTPUT_DIR, { recursive: true });
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
console.log('🚀 Launching browser...');
|
| 21 |
const browser = await chromium.launch({ headless: true });
|
| 22 |
const context = await browser.newContext({
|
|
@@ -25,8 +127,15 @@ async function main() {
|
|
| 25 |
});
|
| 26 |
const page = await context.newPage();
|
| 27 |
|
| 28 |
-
console.log(`📄 Navigating to ${
|
| 29 |
-
await page.goto(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
await page.waitForTimeout(3000);
|
| 31 |
|
| 32 |
let totalCount = 0;
|
|
@@ -47,12 +156,10 @@ async function main() {
|
|
| 47 |
return 'unknown';
|
| 48 |
});
|
| 49 |
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
continue;
|
| 55 |
-
}
|
| 56 |
}
|
| 57 |
|
| 58 |
const label = await element.evaluate((el) => {
|
|
@@ -205,7 +312,8 @@ async function main() {
|
|
| 205 |
|
| 206 |
await page.locator(cloneSelector).screenshot({
|
| 207 |
path: filepath,
|
| 208 |
-
type: 'png'
|
|
|
|
| 209 |
});
|
| 210 |
|
| 211 |
await page.evaluate((selector) => {
|
|
@@ -262,7 +370,8 @@ async function main() {
|
|
| 262 |
|
| 263 |
await page.locator(cloneSelector).screenshot({
|
| 264 |
path: filepath,
|
| 265 |
-
type: 'png'
|
|
|
|
| 266 |
});
|
| 267 |
|
| 268 |
await page.evaluate((selector) => {
|
|
@@ -272,7 +381,8 @@ async function main() {
|
|
| 272 |
} else {
|
| 273 |
await element.screenshot({
|
| 274 |
path: filepath,
|
| 275 |
-
type: 'png'
|
|
|
|
| 276 |
});
|
| 277 |
}
|
| 278 |
|
|
@@ -308,7 +418,7 @@ async function main() {
|
|
| 308 |
});
|
| 309 |
|
| 310 |
await page.waitForTimeout(150);
|
| 311 |
-
await element.screenshot({ path: openFilepath, type: 'png' });
|
| 312 |
console.log(` ✅ ${openFilename}`);
|
| 313 |
|
| 314 |
await selectHandle.evaluate((el) => {
|
|
@@ -359,6 +469,12 @@ async function main() {
|
|
| 359 |
|
| 360 |
await browser.close();
|
| 361 |
console.log(`\n🎉 Done! Captured ${totalCount} screenshots in ${OUTPUT_DIR}/`);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 362 |
}
|
| 363 |
|
| 364 |
main().catch(console.error);
|
|
|
|
| 1 |
import { chromium } from 'playwright';
|
| 2 |
import { mkdir } from 'fs/promises';
|
| 3 |
+
import { join, resolve } from 'path';
|
| 4 |
+
import { spawn } from 'node:child_process';
|
| 5 |
+
import { setTimeout as delay } from 'node:timers/promises';
|
| 6 |
+
import { promises as fs } from 'node:fs';
|
| 7 |
+
import net from 'node:net';
|
| 8 |
|
|
|
|
| 9 |
const OUTPUT_DIR = './screenshots';
|
| 10 |
const DEVICE_SCALE_FACTOR = 4; // 4x for high-quality print
|
| 11 |
const BASE_VIEWPORT = { width: 1200, height: 800 };
|
| 12 |
+
const SCREENSHOT_TIMEOUT_MS = Number(process.env.SCREENSHOT_TIMEOUT_MS || 15000);
|
| 13 |
|
| 14 |
const slugify = (value) =>
|
| 15 |
String(value || '')
|
|
|
|
| 18 |
.replace(/[^a-z0-9]+/g, '-')
|
| 19 |
.replace(/^-+|-+$/g, '');
|
| 20 |
|
| 21 |
+
/**
 * Run a command to completion.
 *
 * The child inherits this process's stdio and is started without a shell unless
 * `options` overrides those defaults.
 *
 * @param {string} command - Executable to start.
 * @param {string[]} [args=[]] - Arguments passed to the executable.
 * @param {object} [options={}] - Extra `spawn` options; they override the defaults.
 * @returns {Promise<undefined>} Resolves when the child exits with code 0.
 * @throws {Error} Rejects when the child cannot be started, exits with a
 *   non-zero code, or is terminated by a signal.
 */
async function run(command, args = [], options = {}) {
  return new Promise((resolvePromise, reject) => {
    const child = spawn(command, args, { stdio: 'inherit', shell: false, ...options });
    child.on('error', reject);
    child.on('exit', (code, signal) => {
      if (code === 0) {
        resolvePromise(undefined);
        return;
      }
      // When the child is killed by a signal `code` is null; report the signal
      // rather than a misleading "exited with code null".
      const reason = signal ? `was terminated by signal ${signal}` : `exited with code ${code}`;
      reject(new Error(`${command} ${args.join(' ')} ${reason}`));
    });
  });
}
|
| 31 |
+
|
| 32 |
+
/**
 * Poll `url` until it answers with a successful (2xx) HTTP response.
 *
 * Requests are retried every 500 ms. Each request is bounded by the time left
 * in the overall budget, so a connection that is accepted but never answered
 * cannot make the call run past `timeoutMs`.
 *
 * @param {string} url - URL to poll.
 * @param {number} [timeoutMs=60000] - Overall time budget in milliseconds.
 * @returns {Promise<void>} Resolves once a request returns an OK response.
 * @throws {Error} When no OK response is seen within `timeoutMs`; the most
 *   recent failure (if any) is attached as `cause`.
 */
async function waitForServer(url, timeoutMs = 60000) {
  // Upper bound accepted by timers; also keeps AbortSignal.timeout() arguments valid.
  const maxTimerMs = 2 ** 31 - 1;
  const start = Date.now();
  let lastError;

  while (Date.now() - start < timeoutMs) {
    try {
      const remaining = timeoutMs - (Date.now() - start);
      const attemptBudget = Math.min(Math.max(1, Math.ceil(remaining)), maxTimerMs);
      const res = await fetch(url, { signal: AbortSignal.timeout(attemptBudget) });
      if (res.ok) return;
      lastError = new Error(`Unexpected HTTP status ${res.status} from ${url}`);
    } catch (err) {
      // Connection failures are expected while the server boots; remember the
      // latest one so the final timeout error can explain what went wrong.
      lastError = err;
    }
    await delay(500);
  }

  const message = `Server did not start in time: ${url}`;
  throw lastError === undefined ? new Error(message) : new Error(message, { cause: lastError });
}
|
| 43 |
+
|
| 44 |
+
/**
 * Find a TCP port that can currently be bound.
 *
 * When `preferredPort` is a positive finite number it is tried first; if it
 * cannot be bound (or no usable preference was given) the operating system is
 * asked for any free port by listening on port 0.
 *
 * @param {number} [preferredPort] - Port to try first.
 * @returns {Promise<number>} A port that was bindable at the time of the check.
 */
async function getFreePort(preferredPort) {
  // Bind a throwaway server to `port`, read back the port actually assigned,
  // then release it again.
  function probe(port) {
    return new Promise((onFree, onBusy) => {
      const probeServer = net.createServer();
      probeServer.unref();
      probeServer.on('error', onBusy);
      probeServer.listen(port, () => {
        const bound = probeServer.address();
        const actualPort = typeof bound === 'object' && bound ? bound.port : port;
        probeServer.close(() => onFree(actualPort));
      });
    });
  }

  const hasPreference =
    typeof preferredPort === 'number' && Number.isFinite(preferredPort) && preferredPort > 0;

  if (hasPreference) {
    try {
      return await probe(preferredPort);
    } catch { }
  }

  return await probe(0);
}
|
| 65 |
+
|
| 66 |
async function main() {
|
| 67 |
await mkdir(OUTPUT_DIR, { recursive: true });
|
| 68 |
|
| 69 |
+
const cwd = process.cwd();
|
| 70 |
+
const distDir = resolve(cwd, 'dist');
|
| 71 |
+
let hasDist = false;
|
| 72 |
+
try {
|
| 73 |
+
const st = await fs.stat(distDir);
|
| 74 |
+
hasDist = st && st.isDirectory();
|
| 75 |
+
} catch { }
|
| 76 |
+
if (!hasDist) {
|
| 77 |
+
console.log('> Building Astro site…');
|
| 78 |
+
await run('npm', ['run', 'build'], { cwd });
|
| 79 |
+
} else {
|
| 80 |
+
console.log('> Skipping build (dist/ exists)…');
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
const startPreview =
|
| 84 |
+
!process.env.SCREENSHOT_BASE_URL &&
|
| 85 |
+
String(process.env.SCREENSHOT_START_PREVIEW || 'true').toLowerCase() !== 'false';
|
| 86 |
+
|
| 87 |
+
const preferredPort = process.env.PREVIEW_PORT ? Number(process.env.PREVIEW_PORT) : undefined;
|
| 88 |
+
const previewPort = startPreview ? await getFreePort(preferredPort) : undefined;
|
| 89 |
+
const baseUrl = process.env.SCREENSHOT_BASE_URL || `http://localhost:${previewPort}/`;
|
| 90 |
+
const url = `${baseUrl.replace(/\/?$/, '/')}` + '?viz=true';
|
| 91 |
+
|
| 92 |
+
let preview = null;
|
| 93 |
+
if (startPreview) {
|
| 94 |
+
console.log(`> Starting Astro preview (port ${previewPort})…`);
|
| 95 |
+
const astroBin = resolve(
|
| 96 |
+
cwd,
|
| 97 |
+
'node_modules',
|
| 98 |
+
'.bin',
|
| 99 |
+
process.platform === 'win32' ? 'astro.cmd' : 'astro',
|
| 100 |
+
);
|
| 101 |
+
preview = spawn(astroBin, ['preview', '--host', '--port', String(previewPort)], {
|
| 102 |
+
cwd,
|
| 103 |
+
stdio: 'inherit',
|
| 104 |
+
detached: true,
|
| 105 |
+
});
|
| 106 |
+
} else {
|
| 107 |
+
console.log(`> Using existing server: ${baseUrl}`);
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
try {
|
| 111 |
+
await waitForServer(baseUrl, 60000);
|
| 112 |
+
} catch (err) {
|
| 113 |
+
if (preview) {
|
| 114 |
+
try {
|
| 115 |
+
// Ensure we don't leave the preview process behind if startup failed
|
| 116 |
+
process.kill(-preview.pid, 'SIGTERM');
|
| 117 |
+
} catch { }
|
| 118 |
+
}
|
| 119 |
+
throw err;
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
console.log('🚀 Launching browser...');
|
| 123 |
const browser = await chromium.launch({ headless: true });
|
| 124 |
const context = await browser.newContext({
|
|
|
|
| 127 |
});
|
| 128 |
const page = await context.newPage();
|
| 129 |
|
| 130 |
+
console.log(`📄 Navigating to ${url}...`);
|
| 131 |
+
const resp = await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60000 });
|
| 132 |
+
if (!resp || !resp.ok()) {
|
| 133 |
+
const status = resp ? `${resp.status()} ${resp.statusText()}` : 'NO_RESPONSE';
|
| 134 |
+
throw new Error(
|
| 135 |
+
`Failed to load ${url} (${status}). ` +
|
| 136 |
+
`If you are running the site elsewhere, set SCREENSHOT_BASE_URL (e.g. http://localhost:4322/).`,
|
| 137 |
+
);
|
| 138 |
+
}
|
| 139 |
await page.waitForTimeout(3000);
|
| 140 |
|
| 141 |
let totalCount = 0;
|
|
|
|
| 156 |
return 'unknown';
|
| 157 |
});
|
| 158 |
|
| 159 |
+
const isVisible = await element.isVisible();
|
| 160 |
+
if (!isVisible) {
|
| 161 |
+
console.log(` ⏭️ Skipping hidden ${type} ${i + 1}`);
|
| 162 |
+
continue;
|
|
|
|
|
|
|
| 163 |
}
|
| 164 |
|
| 165 |
const label = await element.evaluate((el) => {
|
|
|
|
| 312 |
|
| 313 |
await page.locator(cloneSelector).screenshot({
|
| 314 |
path: filepath,
|
| 315 |
+
type: 'png',
|
| 316 |
+
timeout: SCREENSHOT_TIMEOUT_MS
|
| 317 |
});
|
| 318 |
|
| 319 |
await page.evaluate((selector) => {
|
|
|
|
| 370 |
|
| 371 |
await page.locator(cloneSelector).screenshot({
|
| 372 |
path: filepath,
|
| 373 |
+
type: 'png',
|
| 374 |
+
timeout: SCREENSHOT_TIMEOUT_MS
|
| 375 |
});
|
| 376 |
|
| 377 |
await page.evaluate((selector) => {
|
|
|
|
| 381 |
} else {
|
| 382 |
await element.screenshot({
|
| 383 |
path: filepath,
|
| 384 |
+
type: 'png',
|
| 385 |
+
timeout: SCREENSHOT_TIMEOUT_MS
|
| 386 |
});
|
| 387 |
}
|
| 388 |
|
|
|
|
| 418 |
});
|
| 419 |
|
| 420 |
await page.waitForTimeout(150);
|
| 421 |
+
await element.screenshot({ path: openFilepath, type: 'png', timeout: SCREENSHOT_TIMEOUT_MS });
|
| 422 |
console.log(` ✅ ${openFilename}`);
|
| 423 |
|
| 424 |
await selectHandle.evaluate((el) => {
|
|
|
|
| 469 |
|
| 470 |
await browser.close();
|
| 471 |
console.log(`\n🎉 Done! Captured ${totalCount} screenshots in ${OUTPUT_DIR}/`);
|
| 472 |
+
|
| 473 |
+
if (preview) {
|
| 474 |
+
try {
|
| 475 |
+
process.kill(-preview.pid, 'SIGTERM');
|
| 476 |
+
} catch { }
|
| 477 |
+
}
|
| 478 |
}
|
| 479 |
|
| 480 |
main().catch(console.error);
|