File size: 3,170 Bytes
cdc50ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7ecba03
 
cdc50ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import { JSDOM } from 'jsdom';

/**
 * Escape a string so it can be used as a CSS identifier (for example after `#`).
 *
 * Follows the CSSOM "serialize an identifier" rules, i.e. the same algorithm as
 * the browser's CSS.escape(), which is not available as a global in Node:
 * - NUL becomes U+FFFD
 * - control characters and a leading digit (also after a single leading dash)
 *   are written as hex escapes followed by a space
 * - a lone "-" is backslash-escaped
 * - ASCII word characters, "-" and all non-ASCII characters are kept as-is
 * - every other character is backslash-escaped
 *
 * @param {string} str - Raw identifier, such as an element id
 * @return {string} The identifier escaped for use in a selector
 */
function cssEscape( str ) {
	// Array.from iterates by code point, so astral characters stay intact
	// instead of being split into two separately escaped surrogates.
	const chars = Array.from( String( str ) );
	const startsWithDash = chars[ 0 ] === '-';

	return chars.map( ( char, index ) => {
		const code = char.codePointAt( 0 );

		if ( code === 0 ) {
			return '\uFFFD';
		}

		const isControl = code <= 0x1f || code === 0x7f;
		const isDigit = code >= 0x30 && code <= 0x39;
		// An identifier may not start with a digit, nor with "-" followed by a digit
		const isLeadingDigit = isDigit &&
			( index === 0 || ( index === 1 && startsWithDash ) );

		if ( isControl || isLeadingDigit ) {
			// Hex escape; the trailing space terminates the escape sequence
			return `\\${ code.toString( 16 ) } `;
		}

		if ( char === '-' && chars.length === 1 ) {
			return '\\-';
		}

		if ( code >= 0x80 || /[\w-]/.test( char ) ) {
			return char;
		}

		return `\\${ char }`;
	} ).join( '' );
}

/**
 * Minimum trimmed text length for a paragraph to become a chunk.
 * Shorter (or empty) paragraphs are skipped.
 */
const MIN_CHUNK_LENGTH = 50;

/**
 * Selector for a section's own heading: h1-h6 either as a direct child of the
 * section or inside a direct-child wrapper with class "mw-heading".
 * NOTE(review): the wrapper variant is included because newer MediaWiki output
 * may wrap headings in div.mw-heading; confirm against the API variant in use.
 */
const SECTION_HEADING_SELECTOR = [ 1, 2, 3, 4, 5, 6 ]
	.map( ( level ) => `:scope > h${ level }, :scope > .mw-heading > h${ level }` )
	.join( ', ' );

/**
 * Build a selector for `element` as a chain of `tag:nth-of-type(n)` steps
 * starting at `root`.
 *
 * @param {Element} element - Descendant of `root` to address
 * @param {Element} root - Ancestor at which the path starts
 * @param {string} rootSelector - Selector that matches `root`
 * @return {string} Child-combinator path from `root` down to `element`
 */
function buildPositionalSelector( element, root, rootSelector ) {
	const steps = [];
	let node = element;

	while ( node && node !== root ) {
		// nth-of-type counts siblings with the same tag name, so count those before us
		let position = 1;
		let sibling = node.previousElementSibling;
		while ( sibling ) {
			if ( sibling.tagName === node.tagName ) {
				position++;
			}
			sibling = sibling.previousElementSibling;
		}
		steps.unshift( `${ node.tagName.toLowerCase() }:nth-of-type(${ position })` );
		node = node.parentElement;
	}

	return [ rootSelector, ...steps ].join( ' > ' );
}

/**
 * Pick the most stable selector that identifies exactly one <section>.
 *
 * @param {Element} section - The section element
 * @param {Map<string, number>} sectionIdCounts - Occurrences of each data-mw-section-id value
 * @param {Element} container - Element the sections were found in
 * @param {string} containerSelector - Selector that matches `container`
 * @return {string} Selector for the section
 */
function getSectionSelector( section, sectionIdCounts, container, containerSelector ) {
	const sectionId = section.getAttribute( 'data-mw-section-id' );

	// An id shared by several sections cannot tell them apart, so only use it when unique
	if ( sectionId !== null && sectionIdCounts.get( sectionId ) === 1 ) {
		// Escape characters that would terminate or corrupt the quoted attribute value
		const quotedId = sectionId.replace( /["\\]/g, '\\$&' );
		return `section[data-mw-section-id="${ quotedId }"]`;
	}

	if ( section.id ) {
		return `#${ cssEscape( section.id ) }`;
	}

	return buildPositionalSelector( section, container, containerSelector );
}

/**
 * Parse Wikipedia Parsoid HTML into chunks for embedding.
 * Parsoid output uses <section> tags with data-mw-section-id attributes.
 *
 * One chunk is produced per paragraph that is a direct child of a <section>
 * (or of the container when the document has no sections) and whose trimmed
 * text is at least MIN_CHUNK_LENGTH characters long.
 *
 * @param {string} html - The parsed HTML from Wikipedia Action API (with useparsoid)
 * @param {Array} sections - Section metadata from the API (optional); entries
 *  with an `anchor` supply the section title via their `line` property
 * @return {Array<Object>} Chunks in document order, each with `id`
 *  ("chunk-000", ...), `text`, `sectionId` (heading id or null),
 *  `sectionTitle`, `selector` (CSS selector for the paragraph) and
 *  `embedding` (always null here)
 */
export function chunkArticle( html, sections = [] ) {
	const dom = new JSDOM( html );
	const doc = dom.window.document;
	const chunks = [];

	// Build a map of heading anchors to titles from the API metadata.
	// The default parameter only covers `undefined`, so guard against null and non-arrays too.
	const sectionTitles = new Map();
	( Array.isArray( sections ) ? sections : [] ).forEach( ( s ) => {
		if ( s && s.anchor ) {
			sectionTitles.set( s.anchor, s.line );
		}
	} );

	// Parsoid REST API puts mw-parser-output class on body, Action API wraps in a div
	const parserOutput = doc.querySelector( '.mw-parser-output' );
	const container = parserOutput || doc.body;
	if ( !container ) {
		return chunks;
	}
	// Generated selectors must match the container that was actually used
	const containerSelector = parserOutput ? '.mw-parser-output' : 'body';

	const addChunk = ( text, sectionAnchor, sectionTitle, selector ) => {
		chunks.push( {
			// Chunks are numbered in the order they are emitted
			id: `chunk-${ String( chunks.length ).padStart( 3, '0' ) }`,
			text: text,
			sectionId: sectionAnchor,
			sectionTitle: sectionTitle,
			selector: selector,
			embedding: null
		} );
	};

	// Find all sections (Parsoid wraps content in <section> tags)
	const sectionElements = container.querySelectorAll( 'section' );

	if ( sectionElements.length === 0 ) {
		// Fallback: no sections, use flat paragraph structure
		container.querySelectorAll( ':scope > p' ).forEach( ( p, pIdx ) => {
			const text = p.textContent.trim();
			if ( text.length < MIN_CHUNK_LENGTH ) {
				return;
			}
			addChunk(
				text,
				null,
				'Introduction',
				`${ containerSelector } > p:nth-of-type(${ pIdx + 1 })`
			);
		} );
		return chunks;
	}

	// Count how often each section id occurs so ambiguous ids can be avoided
	const sectionIdCounts = new Map();
	sectionElements.forEach( ( section ) => {
		const sectionId = section.getAttribute( 'data-mw-section-id' );
		if ( sectionId !== null ) {
			sectionIdCounts.set( sectionId, ( sectionIdCounts.get( sectionId ) || 0 ) + 1 );
		}
	} );

	sectionElements.forEach( ( section ) => {
		const heading = section.querySelector( SECTION_HEADING_SELECTOR );

		// Sections without a heading of their own are labelled as the introduction
		let sectionTitle = 'Introduction';
		let sectionAnchor = null;

		if ( heading ) {
			sectionAnchor = heading.id;
			sectionTitle = sectionTitles.get( sectionAnchor ) || heading.textContent.trim();
		}

		// The section part of the selector is the same for all of its paragraphs
		const sectionSelector = getSectionSelector(
			section,
			sectionIdCounts,
			container,
			containerSelector
		);

		// Only paragraphs directly in this section; nested sections are visited separately
		section.querySelectorAll( ':scope > p' ).forEach( ( p, pIdx ) => {
			const text = p.textContent.trim();
			if ( text.length < MIN_CHUNK_LENGTH ) {
				return;
			}
			// pIdx counts only <p> children of the section, which is exactly
			// what :nth-of-type counts (1-based)
			addChunk(
				text,
				sectionAnchor,
				sectionTitle,
				`${ sectionSelector } > p:nth-of-type(${ pIdx + 1 })`
			);
		} );
	} );

	return chunks;
}