const USER_AGENT = 'WikipediaQuestionExplorer/1.0 (prototype)';

/**
 * Issue a GET request with the shared User-Agent header and reject on a
 * non-OK response.
 *
 * @param {string} url - Absolute URL to fetch.
 * @return {Promise<Response>} The response, guaranteed to have `ok === true`.
 * @throws {Error} With message `HTTP <status>: <statusText>` and a numeric
 *  `status` property when the response is not OK.
 */
async function fetchOk( url ) {
	const response = await fetch( url, { headers: { 'User-Agent': USER_AGENT } } );
	if ( !response.ok ) {
		const error = new Error( `HTTP ${ response.status }: ${ response.statusText }` );
		// Expose the status code so callers do not have to parse the message.
		error.status = response.status;
		throw error;
	}
	return response;
}

/**
 * Fetch a URL and parse the response body as JSON.
 *
 * @param {string} url - Absolute URL to fetch.
 * @return {Promise<any>} The parsed JSON body.
 * @throws {Error} When the response is not OK (see fetchOk).
 */
async function fetchJSON( url ) {
	const response = await fetchOk( url );
	return response.json();
}

/**
 * Fetch a URL and return the response body as text.
 *
 * @param {string} url - Absolute URL to fetch.
 * @return {Promise<string>} The raw response body.
 * @throws {Error} When the response is not OK (see fetchOk).
 */
async function fetchHTML( url ) {
	const response = await fetchOk( url );
	return response.text();
}

/**
 * Tell whether an error thrown by fetchJSON/fetchHTML represents an HTTP 404.
 *
 * Checks the numeric `status` property rather than searching the message for
 * "404", so unrelated failures that merely contain those digits are not
 * mistaken for a missing page.
 *
 * @param {unknown} error - The caught value.
 * @return {boolean} True only for HTTP 404 errors.
 */
function isNotFoundError( error ) {
	return error?.status === 404;
}

/**
 * Fetch article metadata including revision ID using the Action API
 *
 * @param {string} title - Article title; redirects are followed by the API.
 * @return {Promise<{title: string, revisionId: (number|undefined)}|null>}
 *  The title reported by the API and the revision ID of the first returned
 *  revision, or null when the response has no pages, the page is flagged as
 *  missing, or the request fails with HTTP 404.
 * @throws {Error} For any other request failure.
 */
export async function getArticleMetadata( title ) {
	const params = new URLSearchParams( {
		action: 'query',
		format: 'json',
		titles: title,
		prop: 'revisions',
		rvprop: 'ids',
		redirects: '1'
	} );
	const url = `https://en.wikipedia.org/w/api.php?${ params }`;

	try {
		const data = await fetchJSON( url );
		const pages = data?.query?.pages;
		if ( !pages ) {
			return null;
		}

		// `pages` is keyed by page ID; only one title was requested.
		const page = Object.values( pages )[ 0 ];
		// Guard against an empty `pages` object as well as a "missing" marker.
		if ( !page || page.missing !== undefined ) {
			return null;
		}

		return {
			title: page.title,
			revisionId: page.revisions?.[ 0 ]?.revid
		};
	} catch ( error ) {
		if ( isNotFoundError( error ) ) {
			return null;
		}
		throw error;
	}
}

/**
 * Rewrite fetched article HTML so it can be embedded outside Wikipedia.
 *
 * @param {string} html - Raw HTML document text.
 * @return {string} HTML with the <base> tag removed, "./Title" links and
 *  protocol-relative URLs made absolute, and images marked for lazy loading.
 */
function normalizeArticleHtml( html ) {
	return html
		// Remove the <base> tag to prevent it from affecting URL resolution in the client
		.replace( /<base[^>]*>/gi, '' )
		// Convert relative links to absolute Wikipedia URLs
		.replace( /href="\.\/([^"]+)"/gi, 'href="https://en.wikipedia.org/wiki/$1"' )
		// Convert protocol-relative URLs to https
		.replace( /src="\/\//gi, 'src="https://' )
		.replace( /href="\/\//gi, 'href="https://' )
		.replace( /srcset="([^"]*)"/gi, ( _, srcset ) => {
			// A srcset holds several candidates: fix the first one and every
			// later one that follows whitespace.
			const fixed = srcset
				.replace( /^\/\//, 'https://' )
				.replace( /(\s)\/\//g, '$1https://' );
			return `srcset="${ fixed }"`;
		} )
		// Add lazy loading to images
		.replace( /<img /gi, '<img loading="lazy" ' );
}

/**
 * Fetch parsed HTML for an article using the REST API (Parsoid)
 * Returns HTML with proper section structure for chunking
 *
 * @param {string} title - Article title; spaces are converted to underscores
 *  and the result is URI-encoded before being placed in the request path.
 * @return {Promise<{html: string, title: string, displayTitle: string, sections: Array}|null>}
 *  The normalized HTML plus the text of its <title> element (falling back to
 *  the requested title when none is found), or null on HTTP 404.
 * @throws {Error} For any other request failure.
 */
export async function getArticleHtml( title ) {
	const encodedTitle = encodeURIComponent( title.replace( / /g, '_' ) );
	const url = `https://en.wikipedia.org/api/rest_v1/page/html/${ encodedTitle }`;

	try {
		const html = normalizeArticleHtml( await fetchHTML( url ) );

		// Extract the title from the HTML
		const titleMatch = html.match( /<title>([^<]+)<\/title>/ );
		const extractedTitle = titleMatch ? titleMatch[ 1 ] : title;

		return {
			html,
			title: extractedTitle,
			displayTitle: extractedTitle,
			sections: [] // REST API doesn't return sections separately, but they're in the HTML
		};
	} catch ( error ) {
		if ( isNotFoundError( error ) ) {
			return null;
		}
		throw error;
	}
}

/**
 * Search Wikipedia articles
 *
 * @param {string} query - Title prefix to search for.
 * @param {number} [limit=10] - Maximum number of results to request.
 * @return {Promise<Array<{title: string, description: string}>>} Matches
 *  ordered by the API's `index` field; empty when the query is blank or the
 *  response contains no pages. A page without a description gets ''.
 * @throws {Error} When the request fails.
 */
export async function searchArticles( query, limit = 10 ) {
	if ( !query || query.trim() === '' ) {
		return [];
	}

	// Built with URLSearchParams so every value, including `limit`, is encoded.
	const params = new URLSearchParams( {
		action: 'query',
		format: 'json',
		generator: 'prefixsearch',
		gpssearch: query,
		gpsnamespace: '0',
		gpslimit: String( limit ),
		prop: 'description'
	} );
	const url = `https://en.wikipedia.org/w/api.php?${ params }`;

	const response = await fetchJSON( url );
	const pages = response?.query?.pages;
	if ( !pages ) {
		return [];
	}

	return Object.values( pages )
		.sort( ( a, b ) => a.index - b.index )
		.map( ( page ) => ( {
			title: page.title,
			description: page.description ?? ''
		} ) );
}