Spaces:
Sleeping
Sleeping
| import { JSDOM } from 'jsdom'; | |
/**
 * Escape a string so it can be used as a CSS identifier (e.g. after `#`)
 * inside a selector.
 *
 * Follows the CSSOM "serialize an identifier" rules that back the browser's
 * `CSS.escape()`:
 * - NUL becomes U+FFFD.
 * - Control characters, a leading digit, and a digit following a leading `-`
 *   are written as hex escapes (`\31 `), because a plain backslash prefix is
 *   not valid for them.
 * - A lone `-` is escaped, since it is not a valid identifier by itself.
 * - ASCII letters, digits, `-`, `_` and every code unit >= U+0080 are kept
 *   verbatim (so surrogate pairs are never split).
 * - Any other ASCII character is prefixed with a backslash.
 *
 * @param {string} str - Raw identifier, e.g. an element id
 * @return {string} The identifier, safe to embed in a selector
 */
function cssEscape( str ) {
	const value = String( str );

	// "-" alone cannot be an identifier and must be escaped.
	if ( value === '-' ) {
		return '\\-';
	}

	const startsWithHyphen = value.charCodeAt( 0 ) === 0x2d;
	let escaped = '';

	for ( let i = 0; i < value.length; i++ ) {
		const code = value.charCodeAt( i );
		const char = value.charAt( i );
		const isDigit = code >= 0x30 && code <= 0x39;
		const isControl = ( code >= 0x01 && code <= 0x1f ) || code === 0x7f;
		// Identifiers may not start with a digit, nor with "-" plus a digit.
		const isLeadingDigit = isDigit && ( i === 0 || ( i === 1 && startsWithHyphen ) );

		if ( code === 0 ) {
			escaped += '\uFFFD';
		} else if ( isControl || isLeadingDigit ) {
			// Hex escape; the trailing space terminates the escape sequence.
			escaped += `\\${ code.toString( 16 ) } `;
		} else if (
			code >= 0x80 ||
			code === 0x2d ||
			code === 0x5f ||
			isDigit ||
			( code >= 0x41 && code <= 0x5a ) ||
			( code >= 0x61 && code <= 0x7a )
		) {
			escaped += char;
		} else {
			escaped += `\\${ char }`;
		}
	}

	return escaped;
}
/**
 * Parse Wikipedia Parsoid HTML into paragraph-level chunks for embedding.
 *
 * Parsoid output wraps content in <section> tags carrying a
 * data-mw-section-id attribute. Each direct-child <p> of a section whose
 * trimmed text is at least `minLength` characters becomes one chunk. When the
 * document contains no <section> elements at all, the direct-child <p>
 * elements of the content container are chunked instead.
 *
 * @param {string} html - The parsed HTML from Wikipedia Action API (with useparsoid)
 * @param {Array} [sections] - Section metadata from the API (optional); entries
 *   with an `anchor` supply the section title via their `line` property
 * @param {Object} [options]
 * @param {number} [options.minLength=50] - Paragraphs whose trimmed text is
 *   shorter than this are skipped
 * @return {Array<Object>} Chunks in document order, each with:
 *   `id` ("chunk-NNN", zero-padded running index), `text`, `sectionId`
 *   (heading id, or null when the section has no heading), `sectionTitle`
 *   ("Introduction" when there is no heading), `selector` (CSS selector
 *   intended to locate the paragraph) and `embedding` (always null here)
 */
export function chunkArticle( html, sections = [], { minLength = 50 } = {} ) {
	const dom = new JSDOM( html );

	try {
		const doc = dom.window.document;
		const chunks = [];

		// Build a map of heading anchors to titles from the API metadata.
		const sectionTitles = new Map();
		for ( const meta of sections ?? [] ) {
			if ( meta?.anchor ) {
				sectionTitles.set( meta.anchor, meta.line );
			}
		}

		// Parsoid REST API puts mw-parser-output class on body, Action API wraps in a div.
		const wrapper = doc.querySelector( '.mw-parser-output' );
		const container = wrapper || doc.body;
		if ( !container ) {
			return chunks;
		}

		// Serialize a value as a double-quoted CSS string for an attribute selector.
		const quoteAttrValue = ( value ) => {
			const body = value.replace( /["\\\n\r\f]/g, ( ch ) => {
				if ( ch === '"' || ch === '\\' ) {
					return `\\${ ch }`;
				}
				// Line breaks are not allowed raw inside a CSS string.
				return `\\${ ch.charCodeAt( 0 ).toString( 16 ) } `;
			} );
			return `"${ body }"`;
		};

		// Append one chunk per sufficiently long direct-child paragraph of `parent`.
		const addParagraphChunks = ( parent, parentSelector, sectionAnchor, sectionTitle ) => {
			parent.querySelectorAll( ':scope > p' ).forEach( ( paragraph, index ) => {
				const text = paragraph.textContent.trim();
				// Skip empty or very short paragraphs.
				if ( text.length < minLength ) {
					return;
				}
				chunks.push( {
					// chunks.length is the running index of emitted chunks.
					id: `chunk-${ String( chunks.length ).padStart( 3, '0' ) }`,
					text,
					sectionId: sectionAnchor,
					sectionTitle,
					// `index` counts all direct-child <p> elements (skipped ones
					// included), which is what :nth-of-type counts as well.
					selector: `${ parentSelector } > p:nth-of-type(${ index + 1 })`,
					embedding: null
				} );
			} );
		};

		const sectionElements = container.querySelectorAll( 'section' );

		if ( sectionElements.length === 0 ) {
			// Fallback: no sections, use flat paragraph structure. Name the
			// container that was actually used so the selector can match.
			const containerSelector = wrapper ? '.mw-parser-output' : 'body';
			addParagraphChunks( container, containerSelector, null, 'Introduction' );
			return chunks;
		}

		// Parsoid structure: iterate through sections (nested ones included,
		// since querySelectorAll returns all descendants).
		for ( const section of sectionElements ) {
			const sectionId = section.getAttribute( 'data-mw-section-id' );
			const heading = section.querySelector(
				':scope > h1, :scope > h2, :scope > h3, :scope > h4, :scope > h5, :scope > h6'
			);

			let sectionTitle = 'Introduction';
			let sectionAnchor = null;
			if ( heading ) {
				sectionAnchor = heading.id;
				// Prefer the API-provided title, fall back to the heading text.
				sectionTitle = sectionTitles.get( sectionAnchor ) || heading.textContent.trim();
			}

			let sectionSelector;
			if ( sectionId !== null ) {
				sectionSelector = `section[data-mw-section-id=${ quoteAttrValue( sectionId ) }]`;
			} else if ( section.id ) {
				sectionSelector = `#${ cssEscape( section.id ) }`;
			} else {
				// Fallback for sections without any ID.
				// NOTE(review): this only identifies a section that is the first
				// <section> among its siblings; other unidentified sections get
				// an imprecise selector.
				sectionSelector = 'section:first-of-type';
			}

			addParagraphChunks( section, sectionSelector, sectionAnchor, sectionTitle );
		}

		return chunks;
	} finally {
		// Release the jsdom window once the plain-data chunks are extracted.
		dom.window.close();
	}
}