import { JSDOM } from 'jsdom';

/** Paragraphs whose trimmed text is shorter than this many UTF-16 units are skipped. */
const MIN_PARAGRAPH_LENGTH = 50;

/** Heading tags that may open a section. */
const HEADING_TAGS = [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ];

/** Matches a heading that is a direct child of the section being queried. */
const DIRECT_HEADING_SELECTOR = HEADING_TAGS
	.map( ( tag ) => `:scope > ${ tag }` )
	.join( ', ' );

/** Matches a heading wrapped in a `div.mw-heading` directly under the section. */
const WRAPPED_HEADING_SELECTOR = HEADING_TAGS
	.map( ( tag ) => `:scope > .mw-heading > ${ tag }` )
	.join( ', ' );

/**
 * Escape a string for use as an identifier in a CSS selector.
 *
 * Follows the "serialize an identifier" rules used by CSS.escape():
 * - NUL becomes U+FFFD;
 * - control characters, a leading digit, and a digit right after a leading
 *   "-" are written as hexadecimal code point escapes (e.g. "1a" -> "\31 a");
 * - a lone "-" becomes "\-";
 * - ASCII letters, digits, "_", "-" and anything >= U+0080 are kept as-is;
 * - every other character is prefixed with a backslash.
 *
 * The string is walked by code point, so astral characters are never split
 * into separately-escaped surrogate halves.
 *
 * @param {string} str - Raw identifier (e.g. an element id)
 * @return {string} Escaped identifier, safe to place after "#" or "."
 */
function cssEscape( str ) {
	const chars = Array.from( String( str ) );

	return chars.map( ( ch, index ) => {
		const code = ch.codePointAt( 0 );
		if ( code === 0 ) {
			return '\uFFFD';
		}

		const isControl = code <= 0x1f || code === 0x7f;
		const isDigit = code >= 0x30 && code <= 0x39;
		// An identifier may not start with a digit, nor with "-" then a digit.
		const isLeadingDigit = isDigit &&
			( index === 0 || ( index === 1 && chars[ 0 ] === '-' ) );
		if ( isControl || isLeadingDigit ) {
			// The trailing space terminates the hex escape.
			return `\\${ code.toString( 16 ) } `;
		}

		if ( ch === '-' && chars.length === 1 ) {
			return '\\-';
		}
		if ( code >= 0x80 || /[\w-]/.test( ch ) ) {
			return ch;
		}
		return `\\${ ch }`;
	} ).join( '' );
}

/**
 * Serialize a value as a double-quoted CSS string, e.g. for an attribute
 * selector. Quotes and backslashes are backslash-escaped; line breaks (which
 * cannot appear raw inside a CSS string) become hexadecimal escapes.
 *
 * @param {string} value - Raw attribute value
 * @return {string} The value wrapped in double quotes
 */
function cssString( value ) {
	const escaped = String( value ).replace( /["\\\n\r\f]/g, ( ch ) => (
		ch === '"' || ch === '\\' ?
			`\\${ ch }` :
			`\\${ ch.charCodeAt( 0 ).toString( 16 ) } `
	) );
	return `"${ escaped }"`;
}

/**
 * Compute the 1-based :nth-of-type() index of an element among its siblings.
 *
 * @param {Element} element
 * @return {number}
 */
function nthOfTypeIndex( element ) {
	let position = 1;
	let sibling = element.previousElementSibling;
	while ( sibling ) {
		if ( sibling.localName === element.localName ) {
			position++;
		}
		sibling = sibling.previousElementSibling;
	}
	return position;
}

/**
 * Build a positional selector for an element that has no usable identifier,
 * as a chain of `tag:nth-of-type(n)` steps from `root` down to `element`.
 *
 * The chain is rooted at `.mw-parser-output`, matching the prefix used for
 * flat (section-less) articles.
 * NOTE(review): this assumes the page where the selector is later evaluated
 * wraps the article in `.mw-parser-output` — confirm against the consumer.
 *
 * @param {Element} element - Descendant of `root`
 * @param {Element} root - The content container
 * @return {string}
 */
function structuralSelector( element, root ) {
	const steps = [];
	for ( let node = element; node && node !== root; node = node.parentElement ) {
		steps.unshift(
			`${ cssEscape( node.localName ) }:nth-of-type(${ nthOfTypeIndex( node ) })`
		);
	}
	return [ '.mw-parser-output', ...steps ].join( ' > ' );
}

/**
 * Build the selector that identifies a <section> element, preferring the
 * data-mw-section-id attribute, then the element id, then its position.
 *
 * @param {Element} section
 * @param {Element} container - The content container the section lives in
 * @return {string}
 */
function sectionSelector( section, container ) {
	const sectionId = section.getAttribute( 'data-mw-section-id' );
	if ( sectionId !== null ) {
		return `section[data-mw-section-id=${ cssString( sectionId ) }]`;
	}
	if ( section.id ) {
		return `#${ cssEscape( section.id ) }`;
	}
	// No identifier at all: fall back to the section's position so that
	// different sections never share the same selector.
	return structuralSelector( section, container );
}

/**
 * Parse Wikipedia Parsoid HTML into chunks for embedding.
 * Parsoid output uses <section> tags with data-mw-section-id attributes.
 *
 * One chunk is produced for every <p> that is a direct child of a section
 * (or of the content container when the HTML has no sections) and whose
 * trimmed text is at least MIN_PARAGRAPH_LENGTH long.
 *
 * @param {string} html - The parsed HTML from Wikipedia Action API (with useparsoid)
 * @param {Array} sections - Section metadata from the API (optional); entries
 *  with an `anchor` supply the section title through their `line` property
 * @return {Array<Object>} Chunks in document order, each shaped as
 *  `{ id, text, sectionId, sectionTitle, selector, embedding }`, where `id` is
 *  `chunk-NNN` (zero-padded running index), `sectionId` is the heading's id
 *  (or null), and `embedding` is always null
 */
export function chunkArticle( html, sections = [] ) {
	const dom = new JSDOM( html );
	const doc = dom.window.document;
	const chunks = [];

	// Build a map of section anchors to titles from the API metadata.
	// Tolerate an explicit null/non-array argument and null entries.
	const sectionMap = new Map();
	for ( const meta of Array.isArray( sections ) ? sections : [] ) {
		if ( meta && meta.anchor ) {
			sectionMap.set( meta.anchor, meta.line );
		}
	}

	// Parsoid REST API puts mw-parser-output class on body, Action API wraps in a div
	const container = doc.querySelector( '.mw-parser-output' ) || doc.body;
	if ( !container ) {
		return chunks;
	}

	// The running chunk index always equals the number of chunks emitted so far.
	const addChunk = ( text, sectionId, sectionTitle, selector ) => {
		chunks.push( {
			id: `chunk-${ String( chunks.length ).padStart( 3, '0' ) }`,
			text: text,
			sectionId: sectionId,
			sectionTitle: sectionTitle,
			selector: selector,
			embedding: null
		} );
	};

	// Find all sections (Parsoid wraps content in <section> tags)
	const sectionElements = container.querySelectorAll( 'section' );

	if ( sectionElements.length === 0 ) {
		// Fallback: no sections, use flat paragraph structure
		container.querySelectorAll( ':scope > p' ).forEach( ( p, pIdx ) => {
			const text = p.textContent.trim();
			if ( text.length < MIN_PARAGRAPH_LENGTH ) {
				return;
			}
			addChunk(
				text,
				null,
				'Introduction',
				`.mw-parser-output > p:nth-of-type(${ pIdx + 1 })`
			);
		} );
		return chunks;
	}

	// Parsoid structure: iterate through sections
	sectionElements.forEach( ( section ) => {
		// Accept any heading level. Some MediaWiki output wraps the heading in
		// a div.mw-heading, so look one level down when no direct heading exists.
		const heading = section.querySelector( DIRECT_HEADING_SELECTOR ) ||
			section.querySelector( WRAPPED_HEADING_SELECTOR );

		// A section without a heading is treated as the introduction.
		let sectionTitle = 'Introduction';
		let sectionAnchor = null;
		if ( heading ) {
			sectionAnchor = heading.id;
			sectionTitle = sectionMap.get( sectionAnchor ) || heading.textContent.trim();
		}

		const sectionPrefix = sectionSelector( section, container );

		// Only paragraphs directly in this section; nested sections are
		// visited on their own by the outer loop.
		section.querySelectorAll( ':scope > p' ).forEach( ( p, pIdx ) => {
			const text = p.textContent.trim();
			// Skip empty or very short paragraphs
			if ( text.length < MIN_PARAGRAPH_LENGTH ) {
				return;
			}
			// pIdx counts every direct <p> child (including skipped ones), so
			// pIdx + 1 is exactly the paragraph's :nth-of-type() position.
			addChunk(
				text,
				sectionAnchor,
				sectionTitle,
				`${ sectionPrefix } > p:nth-of-type(${ pIdx + 1 })`
			);
		} );
	} );

	return chunks;
}