lewtun HF Staff committed on
Commit
a303bea
·
1 Parent(s): 379091a

Fix export so that all figures get exported

Browse files
Files changed (1) hide show
  1. app/scripts/export-txt.mjs +54 -5
app/scripts/export-txt.mjs CHANGED
@@ -248,8 +248,12 @@ async function extractArticleContent(page) {
248
  const main = document.querySelector('main');
249
  if (!main) return 'Error: main element not found';
250
 
251
- // Helper: get all visual elements in DOM order (same as screenshot script)
252
- const allVisualElements = Array.from(main.querySelectorAll('.html-embed, .table-scroll > table, .image-wrapper, figure, .katex-display'));
 
 
 
 
253
  const elementIndexMap = new Map();
254
 
255
  // Pre-process: assign global indices to visual elements
@@ -260,7 +264,41 @@ async function extractArticleContent(page) {
260
  // Walk through all child nodes
261
  const processNode = (node) => {
262
  const tag = node.tagName?.toLowerCase();
263
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  // Headings
265
  if (/^h[1-6]$/.test(tag)) {
266
  const level = parseInt(tag[1]);
@@ -383,9 +421,20 @@ async function extractArticleContent(page) {
383
 
384
  // Figures (images, embeds)
385
  if (tag === 'figure') {
 
 
 
 
 
 
 
386
  const img = node.querySelector('img');
387
- const htmlEmbed = node.querySelector('.html-embed, .html-embed--screenshot');
388
- const imageWrapper = node.querySelector('.image-wrapper');
 
 
 
 
389
  const caption = node.querySelector('figcaption');
390
 
391
  // Skip if it's not really a figure (no img, no embed, no caption)
 
248
  const main = document.querySelector('main');
249
  if (!main) return 'Error: main element not found';
250
 
251
+ // Helper: get all visual elements in DOM order (match screenshot-elements.mjs)
252
+ // NOTE: Don't include generic `figure` here. The screenshot script indexes only
253
+ // `.html-embed`, `.table-scroll > table`, `.image-wrapper`, and `.katex-display`.
254
+ const allVisualElements = Array.from(
255
+ document.querySelectorAll('.html-embed, .table-scroll > table, .image-wrapper, .katex-display'),
256
+ );
257
  const elementIndexMap = new Map();
258
 
259
  // Pre-process: assign global indices to visual elements
 
264
  // Walk through all child nodes
265
  const processNode = (node) => {
266
  const tag = node.tagName?.toLowerCase();
267
+
268
+ // Images (Image.astro renders a `.image-wrapper` container; it may or may not contain a <figure>)
269
+ if (node.classList?.contains('image-wrapper')) {
270
+ const globalIndex = elementIndexMap.get(node);
271
+ if (!globalIndex) return;
272
+
273
+ const img = node.querySelector('img');
274
+ const figure = node.querySelector('figure');
275
+ const caption = figure?.querySelector('figcaption') || node.querySelector('figcaption');
276
+
277
+ let name = '';
278
+ let anchor = '';
279
+ let description = '';
280
+
281
+ // Prefer an explicit figure ID (used for cross-references), otherwise wrapper ID.
282
+ if (figure?.id) anchor = figure.id;
283
+ else if (node.id) anchor = node.id;
284
+
285
+ if (caption) {
286
+ const captionText = stripHtml(caption.innerHTML);
287
+ const parsed = parseCaptionText(captionText, 'Figure');
288
+ name = parsed.name;
289
+ description = parsed.description;
290
+ }
291
+
292
+ if (!description && img?.alt) description = img.alt;
293
+ if (!name) name = `image-${globalIndex}`;
294
+
295
+ const parts = [name];
296
+ if (anchor) parts.push(anchor);
297
+ if (description) parts.push(description);
298
+ output.push(`<f>${parts.join(' | ')}</f>\n\n`);
299
+ return;
300
+ }
301
+
302
  // Headings
303
  if (/^h[1-6]$/.test(tag)) {
304
  const level = parseInt(tag[1]);
 
421
 
422
  // Figures (images, embeds)
423
  if (tag === 'figure') {
424
+ // If this <figure> is inside an `.image-wrapper`, the wrapper handler above will emit
425
+ // a single <f> tag for the whole image. Avoid double-emitting.
426
+ if (node.closest?.('.image-wrapper')) {
427
+ const isHtmlEmbedFigure = node.matches?.('.html-embed, .html-embed--screenshot');
428
+ if (!isHtmlEmbedFigure) return;
429
+ }
430
+
431
  const img = node.querySelector('img');
432
+ const htmlEmbed =
433
+ node.matches?.('.html-embed, .html-embed--screenshot')
434
+ ? node
435
+ : node.querySelector('.html-embed, .html-embed--screenshot');
436
+ // Images are wrapped in an ancestor `.image-wrapper` container in this codebase (closest() only matches the node itself or its ancestors, never siblings)
437
+ const imageWrapper = node.closest?.('.image-wrapper') || null;
438
  const caption = node.querySelector('figcaption');
439
 
440
  // Skip if it's not really a figure (no img, no embed, no caption)