Eric Gardner
Use Wikipedia REST API for article content
7ecba03
// User-Agent header value sent with every request issued by fetchJSON and fetchHTML.
const USER_AGENT = 'WikipediaQuestionExplorer/1.0 (prototype)';
/**
 * Request a URL and parse the response body as JSON.
 *
 * @param {string} url - URL to request.
 * @return {Promise<any>} The parsed JSON body.
 * @throws {Error} If the response is not OK. The message has the form
 *  `HTTP <status>: <statusText>` and the numeric HTTP status is also exposed
 *  as `error.status`, so callers do not have to parse the message text.
 */
async function fetchJSON( url ) {
	const response = await fetch( url, {
		headers: { 'User-Agent': USER_AGENT }
	} );
	if ( !response.ok ) {
		const error = new Error( `HTTP ${ response.status }: ${ response.statusText }` );
		// Machine-readable status code; the message format is kept unchanged.
		error.status = response.status;
		throw error;
	}
	return response.json();
}
/**
 * Request a URL and return the response body as text.
 *
 * @param {string} url - URL to request.
 * @return {Promise<string>} The response body.
 * @throws {Error} If the response is not OK. The message has the form
 *  `HTTP <status>: <statusText>` and the numeric HTTP status is also exposed
 *  as `error.status`, so callers do not have to parse the message text.
 */
async function fetchHTML( url ) {
	const response = await fetch( url, {
		headers: { 'User-Agent': USER_AGENT }
	} );
	if ( !response.ok ) {
		const error = new Error( `HTTP ${ response.status }: ${ response.statusText }` );
		// Machine-readable status code; the message format is kept unchanged.
		error.status = response.status;
		throw error;
	}
	return response.text();
}
/**
 * Fetch article metadata including revision ID using the Action API.
 *
 * The request asks the API to resolve redirects (`redirects=1`), so the
 * returned title is the one reported by the API, which may differ from the
 * title passed in.
 *
 * @param {string} title - Article title to look up.
 * @return {Promise<{title: string, revisionId: (number|undefined)}|null>}
 *  The page title and the ID of the first revision listed in the response
 *  (`undefined` if the response lists no revisions), or `null` when the
 *  response contains no usable page (no pages, a missing page, an invalid
 *  title, or an HTTP 404).
 * @throws {Error} Any non-404 failure from the underlying request.
 */
export async function getArticleMetadata( title ) {
	const params = new URLSearchParams( {
		action: 'query',
		format: 'json',
		titles: title,
		prop: 'revisions',
		rvprop: 'ids',
		redirects: '1'
	} );
	const url = `https://en.wikipedia.org/w/api.php?${ params }`;
	try {
		const data = await fetchJSON( url );
		const pages = data?.query?.pages;
		if ( !pages ) {
			return null;
		}
		// Only one title is requested, so only the first entry is relevant.
		const page = Object.values( pages )[ 0 ];
		// An empty `pages` object, a `missing` flag (page does not exist) or an
		// `invalid` flag (malformed title) all mean there is nothing to return.
		if ( !page || page.missing !== undefined || page.invalid !== undefined ) {
			return null;
		}
		return {
			title: page.title,
			revisionId: page.revisions?.[ 0 ]?.revid
		};
	} catch ( error ) {
		// Prefer a numeric status when the error carries one; otherwise fall
		// back to the `HTTP <status>: ...` message prefix. Anchoring avoids
		// treating unrelated errors that merely mention "404" as not-found.
		const isNotFound = error?.status === 404 ||
			/^HTTP 404\b/.test( String( error?.message ?? '' ) );
		if ( isNotFound ) {
			return null;
		}
		throw error;
	}
}
/**
 * Fetch parsed HTML for an article using the REST API (Parsoid).
 * Returns HTML with proper section structure for chunking.
 *
 * The fetched markup is rewritten so it can be embedded outside Wikipedia:
 * the `<base>` tag is dropped, `./`-relative and protocol-relative URLs are
 * made absolute, and images are marked for lazy loading.
 *
 * @param {string} title - Article title; spaces are converted to underscores
 *  before the title is URL-encoded.
 * @return {Promise<{html: string, title: string, displayTitle: string, sections: Array}|null>}
 *  The rewritten HTML and its title (taken from the document's `<title>`
 *  element, falling back to the requested title), or `null` on HTTP 404.
 * @throws {Error} Any non-404 failure from the underlying request.
 */
export async function getArticleHtml( title ) {
	const encodedTitle = encodeURIComponent( title.replace( / /g, '_' ) );
	const url = `https://en.wikipedia.org/api/rest_v1/page/html/${ encodedTitle }`;

	let html;
	try {
		html = await fetchHTML( url );
	} catch ( error ) {
		// Prefer a numeric status when the error carries one; otherwise fall
		// back to the `HTTP <status>: ...` message prefix. Anchoring avoids
		// treating unrelated errors that merely mention "404" as not-found.
		const isNotFound = error?.status === 404 ||
			/^HTTP 404\b/.test( String( error?.message ?? '' ) );
		if ( isNotFound ) {
			return null;
		}
		throw error;
	}

	// Remove the <base> tag to prevent it from affecting URL resolution in the client
	html = html.replace( /<base[^>]*>/gi, '' );

	// Convert relative links to absolute Wikipedia URLs
	html = html.replace( /href="\.\/([^"]+)"/gi, 'href="https://en.wikipedia.org/wiki/$1"' );

	// Convert protocol-relative URLs to https
	html = html.replace( /src="\/\//gi, 'src="https://' );
	html = html.replace( /href="\/\//gi, 'href="https://' );
	// A srcset holds several candidates, so fix both the first one and every
	// one that follows whitespace.
	html = html.replace( /srcset="([^"]*)"/gi, ( _, srcset ) => {
		const fixed = srcset
			.replace( /^\/\//g, 'https://' )
			.replace( /(\s)\/\//g, '$1https://' );
		return `srcset="${ fixed }"`;
	} );

	// Add lazy loading to images
	html = html.replace( /<img /gi, '<img loading="lazy" ' );

	// Extract title from the HTML
	const titleMatch = html.match( /<title>([^<]+)<\/title>/ );
	const extractedTitle = titleMatch ? titleMatch[ 1 ] : title;

	return {
		html,
		title: extractedTitle,
		displayTitle: extractedTitle,
		sections: [] // REST API doesn't return sections separately, but they're in the HTML
	};
}
/**
 * Search Wikipedia articles by title prefix.
 *
 * Uses the Action API `prefixsearch` generator restricted to namespace 0 and
 * requests each page's description.
 *
 * @param {string} query - Search text. An empty or whitespace-only query
 *  returns an empty list without making a request.
 * @param {number} [limit=10] - Value sent as `gpslimit`.
 * @return {Promise<Array<{title: string, description: string}>>} Matches
 *  ordered by the `index` field of each returned page; `description` is an
 *  empty string when the page has none.
 * @throws {Error} Any failure from the underlying request.
 */
export async function searchArticles( query, limit = 10 ) {
	if ( !query || query.trim() === '' ) {
		return [];
	}

	// URLSearchParams encodes every value, so neither the query nor the limit
	// can inject extra parameters into the request.
	const params = new URLSearchParams( {
		action: 'query',
		format: 'json',
		generator: 'prefixsearch',
		gpssearch: query,
		gpsnamespace: '0',
		gpslimit: String( limit ),
		prop: 'description'
	} );
	const url = `https://en.wikipedia.org/w/api.php?${ params }`;

	const response = await fetchJSON( url );
	const pages = response?.query?.pages;
	if ( !pages ) {
		return [];
	}

	// `pages` is an object keyed by page; restore the ranking via `index`.
	return Object.values( pages )
		.sort( ( a, b ) => a.index - b.index )
		.map( ( page ) => ( {
			title: page.title,
			description: page.description || ''
		} ) );
}