Eric Gardner committed on
Commit
7ecba03
·
1 Parent(s): cdc50ff

Use Wikipedia REST API for article content

Browse files
Files changed (3) hide show
  1. index.js +4 -1
  2. services/chunker.js +2 -1
  3. services/wikipedia.js +27 -34
index.js CHANGED
@@ -7,7 +7,10 @@ import { initEmbedder } from './services/embedder.js';
7
  const app = express();
8
  const PORT = process.env.PORT || 3000;
9
 
10
- app.use( cors() );
 
 
 
11
  app.use( express.json() );
12
 
13
  // Routes
 
7
  const app = express();
8
  const PORT = process.env.PORT || 3000;
9
 
10
+ app.use( cors( {
11
+ origin: true,
12
+ credentials: true
13
+ } ) );
14
  app.use( express.json() );
15
 
16
  // Routes
services/chunker.js CHANGED
@@ -27,7 +27,8 @@ export function chunkArticle( html, sections = [] ) {
27
  }
28
  } );
29
 
30
- const container = doc.querySelector( '.mw-parser-output' );
 
31
  if ( !container ) {
32
  return chunks;
33
  }
 
27
  }
28
  } );
29
 
30
+ // Parsoid REST API puts mw-parser-output class on body, Action API wraps in a div
31
+ const container = doc.querySelector( '.mw-parser-output' ) || doc.body;
32
  if ( !container ) {
33
  return chunks;
34
  }
services/wikipedia.js CHANGED
@@ -12,6 +12,18 @@ async function fetchJSON( url ) {
12
  return response.json();
13
  }
14
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  /**
16
  * Fetch article metadata including revision ID using the Action API
17
  */
@@ -52,46 +64,26 @@ export async function getArticleMetadata( title ) {
52
  }
53
 
54
  /**
55
- * Fetch parsed HTML for an article using the Action API with mobile format and Parsoid
56
- * Returns cleaner HTML optimized for mobile viewing (no infoboxes, cleaner structure)
57
  */
58
  export async function getArticleHtml( title ) {
59
- const params = new URLSearchParams( {
60
- action: 'parse',
61
- format: 'json',
62
- page: title,
63
- prop: 'text|sections|displaytitle',
64
- redirects: '1',
65
- disableeditsection: '1',
66
- disabletoc: '1',
67
- mobileformat: '1',
68
- useparsoid: '1'
69
- } );
70
-
71
- const url = `https://en.wikipedia.org/w/api.php?${ params }`;
72
 
73
  try {
74
- const data = await fetchJSON( url );
75
-
76
- if ( data.error ) {
77
- if ( data.error.code === 'missingtitle' ) {
78
- return null;
79
- }
80
- throw new Error( data.error.info );
81
- }
82
 
83
- let html = data.parse.text[ '*' ];
 
84
 
85
  // Convert relative links to absolute Wikipedia URLs
86
- html = html.replace( /href="\/wiki\//gi, 'href="https://en.wikipedia.org/wiki/' );
87
- html = html.replace( /href="\/w\//gi, 'href="https://en.wikipedia.org/w/' );
88
  html = html.replace( /href="\.\/([^"]+)"/gi, 'href="https://en.wikipedia.org/wiki/$1"' );
89
 
90
- // Convert protocol-relative URLs to https (only at start of URL, not ://)
91
  html = html.replace( /src="\/\//gi, 'src="https://' );
92
- // Handle srcset with multiple entries - only replace // at start of URLs, not :// in the middle
93
  html = html.replace( /srcset="([^"]*)"/gi, ( _, srcset ) => {
94
- // Replace space followed by // (new URL entry) or start of string //
95
  const fixed = srcset
96
  .replace( /^\/\//g, 'https://' )
97
  .replace( /(\s)\/\//g, '$1https://' );
@@ -101,14 +93,15 @@ export async function getArticleHtml( title ) {
101
  // Add lazy loading to images
102
  html = html.replace( /<img /gi, '<img loading="lazy" ' );
103
 
104
- // Wrap in a container div with class for styling
105
- html = `<div class="mw-parser-output">${ html }</div>`;
 
106
 
107
  return {
108
  html,
109
- title: data.parse.title,
110
- displayTitle: data.parse.displaytitle,
111
- sections: data.parse.sections
112
  };
113
  } catch ( error ) {
114
  if ( error.message.includes( '404' ) ) {
 
12
  return response.json();
13
  }
14
 
15
/**
 * Fetch a URL and return its response body as text.
 * Sends the project User-Agent header on the request and throws an
 * Error describing the HTTP status when the response is not OK.
 *
 * @param {string} url - Fully-qualified URL to fetch.
 * @returns {Promise<string>} The raw response body text.
 * @throws {Error} When the HTTP status is outside the 2xx range.
 */
async function fetchHTML( url ) {
	const response = await fetch( url, { headers: { 'User-Agent': USER_AGENT } } );

	// Successful response: hand back the body as text.
	if ( response.ok ) {
		return response.text();
	}

	throw new Error( `HTTP ${ response.status }: ${ response.statusText }` );
}
26
+
27
  /**
28
  * Fetch article metadata including revision ID using the Action API
29
  */
 
64
  }
65
 
66
  /**
67
+ * Fetch parsed HTML for an article using the REST API (Parsoid)
68
+ * Returns HTML with proper section structure for chunking
69
  */
70
  export async function getArticleHtml( title ) {
71
+ const encodedTitle = encodeURIComponent( title.replace( / /g, '_' ) );
72
+ const url = `https://en.wikipedia.org/api/rest_v1/page/html/${ encodedTitle }`;
 
 
 
 
 
 
 
 
 
 
 
73
 
74
  try {
75
+ let html = await fetchHTML( url );
 
 
 
 
 
 
 
76
 
77
+ // Remove the <base> tag to prevent it from affecting URL resolution in the client
78
+ html = html.replace( /<base[^>]*>/gi, '' );
79
 
80
  // Convert relative links to absolute Wikipedia URLs
 
 
81
  html = html.replace( /href="\.\/([^"]+)"/gi, 'href="https://en.wikipedia.org/wiki/$1"' );
82
 
83
+ // Convert protocol-relative URLs to https
84
  html = html.replace( /src="\/\//gi, 'src="https://' );
85
+ html = html.replace( /href="\/\//gi, 'href="https://' );
86
  html = html.replace( /srcset="([^"]*)"/gi, ( _, srcset ) => {
 
87
  const fixed = srcset
88
  .replace( /^\/\//g, 'https://' )
89
  .replace( /(\s)\/\//g, '$1https://' );
 
93
  // Add lazy loading to images
94
  html = html.replace( /<img /gi, '<img loading="lazy" ' );
95
 
96
+ // Extract title from the HTML
97
+ const titleMatch = html.match( /<title>([^<]+)<\/title>/ );
98
+ const extractedTitle = titleMatch ? titleMatch[ 1 ] : title;
99
 
100
  return {
101
  html,
102
+ title: extractedTitle,
103
+ displayTitle: extractedTitle,
104
+ sections: [] // REST API doesn't return sections separately, but they're in the HTML
105
  };
106
  } catch ( error ) {
107
  if ( error.message.includes( '404' ) ) {