Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
Fix: ensure all figures get exported
Browse files
- app/scripts/export-txt.mjs +54 -5
app/scripts/export-txt.mjs
CHANGED
|
@@ -248,8 +248,12 @@ async function extractArticleContent(page) {
|
|
| 248 |
const main = document.querySelector('main');
|
| 249 |
if (!main) return 'Error: main element not found';
|
| 250 |
|
| 251 |
-
// Helper: get all visual elements in DOM order (
|
| 252 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
const elementIndexMap = new Map();
|
| 254 |
|
| 255 |
// Pre-process: assign global indices to visual elements
|
|
@@ -260,7 +264,41 @@ async function extractArticleContent(page) {
|
|
| 260 |
// Walk through all child nodes
|
| 261 |
const processNode = (node) => {
|
| 262 |
const tag = node.tagName?.toLowerCase();
|
| 263 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
// Headings
|
| 265 |
if (/^h[1-6]$/.test(tag)) {
|
| 266 |
const level = parseInt(tag[1]);
|
|
@@ -383,9 +421,20 @@ async function extractArticleContent(page) {
|
|
| 383 |
|
| 384 |
// Figures (images, embeds)
|
| 385 |
if (tag === 'figure') {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 386 |
const img = node.querySelector('img');
|
| 387 |
-
const htmlEmbed =
|
| 388 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 389 |
const caption = node.querySelector('figcaption');
|
| 390 |
|
| 391 |
// Skip if it's not really a figure (no img, no embed, no caption)
|
|
|
|
| 248 |
const main = document.querySelector('main');
|
| 249 |
if (!main) return 'Error: main element not found';
|
| 250 |
|
| 251 |
+
// Helper: get all visual elements in DOM order (match screenshot-elements.mjs)
|
| 252 |
+
// NOTE: Don't include generic `figure` here. The screenshot script indexes only
|
| 253 |
+
// `.html-embed`, `.table-scroll > table`, `.image-wrapper`, and `.katex-display`.
|
| 254 |
+
const allVisualElements = Array.from(
|
| 255 |
+
document.querySelectorAll('.html-embed, .table-scroll > table, .image-wrapper, .katex-display'),
|
| 256 |
+
);
|
| 257 |
const elementIndexMap = new Map();
|
| 258 |
|
| 259 |
// Pre-process: assign global indices to visual elements
|
|
|
|
| 264 |
// Walk through all child nodes
|
| 265 |
const processNode = (node) => {
|
| 266 |
const tag = node.tagName?.toLowerCase();
|
| 267 |
+
|
| 268 |
+
// Images (Image.astro renders a `.image-wrapper` container; it may or may not contain a <figure>)
|
| 269 |
+
if (node.classList?.contains('image-wrapper')) {
|
| 270 |
+
const globalIndex = elementIndexMap.get(node);
|
| 271 |
+
if (!globalIndex) return;
|
| 272 |
+
|
| 273 |
+
const img = node.querySelector('img');
|
| 274 |
+
const figure = node.querySelector('figure');
|
| 275 |
+
const caption = figure?.querySelector('figcaption') || node.querySelector('figcaption');
|
| 276 |
+
|
| 277 |
+
let name = '';
|
| 278 |
+
let anchor = '';
|
| 279 |
+
let description = '';
|
| 280 |
+
|
| 281 |
+
// Prefer an explicit figure ID (used for cross-references), otherwise wrapper ID.
|
| 282 |
+
if (figure?.id) anchor = figure.id;
|
| 283 |
+
else if (node.id) anchor = node.id;
|
| 284 |
+
|
| 285 |
+
if (caption) {
|
| 286 |
+
const captionText = stripHtml(caption.innerHTML);
|
| 287 |
+
const parsed = parseCaptionText(captionText, 'Figure');
|
| 288 |
+
name = parsed.name;
|
| 289 |
+
description = parsed.description;
|
| 290 |
+
}
|
| 291 |
+
|
| 292 |
+
if (!description && img?.alt) description = img.alt;
|
| 293 |
+
if (!name) name = `image-${globalIndex}`;
|
| 294 |
+
|
| 295 |
+
const parts = [name];
|
| 296 |
+
if (anchor) parts.push(anchor);
|
| 297 |
+
if (description) parts.push(description);
|
| 298 |
+
output.push(`<f>${parts.join(' | ')}</f>\n\n`);
|
| 299 |
+
return;
|
| 300 |
+
}
|
| 301 |
+
|
| 302 |
// Headings
|
| 303 |
if (/^h[1-6]$/.test(tag)) {
|
| 304 |
const level = parseInt(tag[1]);
|
|
|
|
| 421 |
|
| 422 |
// Figures (images, embeds)
|
| 423 |
if (tag === 'figure') {
|
| 424 |
+
// If this <figure> is inside an `.image-wrapper`, the wrapper handler above will emit
|
| 425 |
+
// a single <f> tag for the whole image. Avoid double-emitting.
|
| 426 |
+
if (node.closest?.('.image-wrapper')) {
|
| 427 |
+
const isHtmlEmbedFigure = node.matches?.('.html-embed, .html-embed--screenshot');
|
| 428 |
+
if (!isHtmlEmbedFigure) return;
|
| 429 |
+
}
|
| 430 |
+
|
| 431 |
const img = node.querySelector('img');
|
| 432 |
+
const htmlEmbed =
|
| 433 |
+
node.matches?.('.html-embed, .html-embed--screenshot')
|
| 434 |
+
? node
|
| 435 |
+
: node.querySelector('.html-embed, .html-embed--screenshot');
|
| 436 |
+
// Images are wrapped in an ancestor `.image-wrapper` container in this codebase (closest() only matches the node itself or its ancestors, never siblings)
|
| 437 |
+
const imageWrapper = node.closest?.('.image-wrapper') || null;
|
| 438 |
const caption = node.querySelector('figcaption');
|
| 439 |
|
| 440 |
// Skip if it's not really a figure (no img, no embed, no caption)
|