// Eric Gardner
// Use Wikipedia REST API for article content
// 7ecba03
import { JSDOM } from 'jsdom';
/**
 * Escape a string so it can be used as an identifier in a CSS selector
 * (for example after `#`).
 *
 * Implements the same rules as the CSSOM `CSS.escape()` serialization:
 * - U+0000 becomes U+FFFD.
 * - Control characters (U+0001–U+001F, U+007F) are written as hex escapes,
 *   because a backslash followed by e.g. a newline is not a valid escape.
 * - A leading digit, or a digit right after a leading `-`, is written as a
 *   hex escape, because an identifier may not start that way.
 * - A lone `-` is backslash-escaped.
 * - ASCII letters, digits, `-`, `_` and everything at or above U+0080 are
 *   kept as-is. Leaving non-ASCII code units untouched keeps surrogate pairs
 *   intact.
 * - Any other ASCII character is backslash-escaped.
 *
 * @param {string} str - Raw identifier text, such as an element ID
 * @return {string} The identifier, safe to embed in a selector
 */
function cssEscape( str ) {
	const value = String( str );
	const firstUnit = value.charCodeAt( 0 );
	let escaped = '';

	for ( let i = 0; i < value.length; i++ ) {
		const unit = value.charCodeAt( i );
		const char = value.charAt( i );
		const isDigit = unit >= 0x30 && unit <= 0x39;
		const isControl = ( unit >= 0x01 && unit <= 0x1f ) || unit === 0x7f;
		const isAsciiLetter = ( unit >= 0x41 && unit <= 0x5a ) || ( unit >= 0x61 && unit <= 0x7a );

		if ( unit === 0x00 ) {
			escaped += '\uFFFD';
		} else if (
			isControl ||
			( i === 0 && isDigit ) ||
			( i === 1 && isDigit && firstUnit === 0x2d )
		) {
			// Hex escape; the trailing space terminates the escape sequence.
			escaped += `\\${ unit.toString( 16 ) } `;
		} else if ( i === 0 && unit === 0x2d && value.length === 1 ) {
			// A lone "-" is not a valid identifier on its own.
			escaped += `\\${ char }`;
		} else if ( unit >= 0x80 || unit === 0x2d || unit === 0x5f || isDigit || isAsciiLetter ) {
			escaped += char;
		} else {
			escaped += `\\${ char }`;
		}
	}

	return escaped;
}
// Paragraphs whose trimmed text is shorter than this are not turned into chunks.
const MIN_CHUNK_TEXT_LENGTH = 50;

// Selector for a section's own heading: any heading level that is a direct
// child of the section.
// NOTE(review): some MediaWiki outputs appear to wrap headings in
// <div class="mw-heading">; that form is accepted too — confirm against the
// HTML actually returned by the API in use.
const SECTION_HEADING_SELECTOR = [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ]
	.flatMap( ( tag ) => [ `:scope > ${ tag }`, `:scope > .mw-heading > ${ tag }` ] )
	.join( ', ' );

/**
 * Parse Wikipedia Parsoid HTML into paragraph-level chunks for embedding.
 *
 * Parsoid output wraps content in <section> tags carrying a
 * data-mw-section-id attribute. Every <p> that is a direct child of a section
 * and has at least 50 characters of trimmed text becomes one chunk. When the
 * document contains no <section> elements at all, the direct <p> children of
 * the content container are chunked instead.
 *
 * @param {string} html - Parsoid HTML for the article (REST API or Action API output)
 * @param {Array} sections - Section metadata from the API (optional). Entries
 *   with an `anchor` supply the section title through their `line` property.
 * @return {Array<Object>} Chunks in document order, each with the shape
 *   `{ id, text, sectionId, sectionTitle, selector, embedding }`. `id` is
 *   `chunk-NNN` (zero-padded running index), `sectionId` is the heading's
 *   anchor (null when the section has no heading), `selector` is a CSS
 *   selector for the source paragraph, and `embedding` is always null here.
 */
export function chunkArticle( html, sections = [] ) {
	const dom = new JSDOM( html );
	const doc = dom.window.document;
	const chunks = [];

	// Build a map of heading anchors to titles from the API metadata.
	// Tolerate an explicit null / non-array argument instead of throwing.
	const sectionMap = new Map();
	const sectionList = Array.isArray( sections ) ? sections : [];
	for ( const meta of sectionList ) {
		if ( meta && meta.anchor ) {
			sectionMap.set( meta.anchor, meta.line );
		}
	}

	// Parsoid REST API puts mw-parser-output class on body, Action API wraps in a div
	const container = doc.querySelector( '.mw-parser-output' ) || doc.body;
	if ( !container ) {
		return chunks;
	}

	// Turn every sufficiently long paragraph of `paragraphs` into a chunk.
	// `parentSelector` must select the paragraphs' parent element; the
	// paragraph's position in the list is its nth-of-type index there.
	const addChunks = ( paragraphs, parentSelector, sectionAnchor, sectionTitle ) => {
		paragraphs.forEach( ( paragraph, index ) => {
			const text = paragraph.textContent.trim();
			// Skip empty or very short paragraphs
			if ( text.length < MIN_CHUNK_TEXT_LENGTH ) {
				return;
			}
			chunks.push( {
				// chunks.length is the running index of the chunk being added.
				id: `chunk-${ String( chunks.length ).padStart( 3, '0' ) }`,
				text,
				sectionId: sectionAnchor,
				sectionTitle,
				selector: `${ parentSelector } > p:nth-of-type(${ index + 1 })`,
				embedding: null
			} );
		} );
	};

	// Find all sections (Parsoid wraps content in <section> tags)
	const sectionElements = container.querySelectorAll( 'section' );

	if ( sectionElements.length === 0 ) {
		// Fallback: no sections, use flat paragraph structure. Address the
		// container by what it really is so the selector still matches when
		// the container is a plain <body> without the mw-parser-output class.
		const containerSelector = container.classList.contains( 'mw-parser-output' ) ?
			'.mw-parser-output' :
			'body';
		addChunks(
			container.querySelectorAll( ':scope > p' ),
			containerSelector,
			null,
			'Introduction'
		);
		return chunks;
	}

	// Parsoid structure: iterate through sections (nested ones included)
	sectionElements.forEach( ( section ) => {
		const sectionId = section.getAttribute( 'data-mw-section-id' );
		const heading = section.querySelector( SECTION_HEADING_SELECTOR );

		// Sections without a heading of their own are reported as "Introduction".
		let sectionTitle = 'Introduction';
		let sectionAnchor = null;
		if ( heading ) {
			sectionAnchor = heading.id;
			// Prefer the API-provided title, fall back to the heading text.
			sectionTitle = sectionMap.get( sectionAnchor ) || heading.textContent.trim();
		}

		// Selector for the section itself, computed once per section.
		let parentSelector;
		if ( sectionId !== null ) {
			parentSelector = `section[data-mw-section-id="${ sectionId }"]`;
		} else if ( section.id ) {
			parentSelector = `#${ cssEscape( section.id ) }`;
		} else {
			// Fallback for sections without ID
			parentSelector = 'section:first-of-type';
		}

		// Only paragraphs directly in this section; nested sections are
		// visited on their own iteration.
		addChunks(
			section.querySelectorAll( ':scope > p' ),
			parentSelector,
			sectionAnchor,
			sectionTitle
		);
	} );

	return chunks;
}