File size: 3,524 Bytes
cdc50ff
 
 
 
 
 
 
 
 
 
 
 
 
 
7ecba03
 
 
 
 
 
 
 
 
 
 
 
cdc50ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7ecba03
 
cdc50ff
 
7ecba03
 
cdc50ff
 
7ecba03
cdc50ff
7ecba03
 
cdc50ff
 
 
 
7ecba03
cdc50ff
7ecba03
cdc50ff
 
 
 
 
 
 
 
 
 
7ecba03
 
 
cdc50ff
 
 
7ecba03
 
 
cdc50ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
// Identifies this client to Wikipedia's APIs (Wikimedia asks bots/tools to send
// a descriptive User-Agent); sent on every request below.
const USER_AGENT = 'WikipediaQuestionExplorer/1.0 (prototype)';

/**
 * GET a URL with this app's User-Agent and parse the response body as JSON.
 *
 * @param {string} url Fully-formed URL to request
 * @return {Promise<Object>} Parsed JSON response body
 * @throws {Error} On any non-2xx HTTP status
 */
async function fetchJSON( url ) {
	const headers = { 'User-Agent': USER_AGENT };
	const response = await fetch( url, { headers } );

	if ( response.ok ) {
		return response.json();
	}

	throw new Error( `HTTP ${ response.status }: ${ response.statusText }` );
}

/**
 * GET a URL with this app's User-Agent and return the raw response text.
 *
 * @param {string} url Fully-formed URL to request
 * @return {Promise<string>} Response body as text
 * @throws {Error} On any non-2xx HTTP status
 */
async function fetchHTML( url ) {
	const headers = { 'User-Agent': USER_AGENT };
	const response = await fetch( url, { headers } );

	if ( response.ok ) {
		return response.text();
	}

	throw new Error( `HTTP ${ response.status }: ${ response.statusText }` );
}

/**
 * Fetch article metadata including revision ID using the Action API.
 *
 * @param {string} title Article title; redirects are followed server-side
 * @return {Promise<?{title: string, revisionId: number|undefined}>}
 *   Normalized title and latest revision ID, or null if the page is missing
 *   (or the request 404s)
 */
export async function getArticleMetadata( title ) {
	const params = new URLSearchParams( {
		action: 'query',
		format: 'json',
		titles: title,
		prop: 'revisions',
		rvprop: 'ids',
		redirects: '1'
	} );

	try {
		const data = await fetchJSON( `https://en.wikipedia.org/w/api.php?${ params }` );
		const pages = data.query?.pages;
		if ( !pages ) {
			return null;
		}

		// The API keys pages by page ID; a single-title query yields one entry.
		const [ page ] = Object.values( pages );

		return page.missing === undefined ?
			{
				title: page.title,
				revisionId: page.revisions?.[ 0 ]?.revid
			} :
			null;
	} catch ( error ) {
		// Treat a 404 as "article not found" rather than an error.
		if ( error.message.includes( '404' ) ) {
			return null;
		}
		throw error;
	}
}

/**
 * Fetch parsed HTML for an article using the REST API (Parsoid).
 * Returns HTML with proper section structure for chunking.
 *
 * @param {string} title Article title (spaces allowed; converted to underscores)
 * @return {Promise<?{html: string, title: string, displayTitle: string, sections: Array}>}
 *   Rewritten HTML plus the extracted title, or null on 404
 */
export async function getArticleHtml( title ) {
	const encodedTitle = encodeURIComponent( title.replace( / /g, '_' ) );
	const url = `https://en.wikipedia.org/api/rest_v1/page/html/${ encodedTitle }`;

	try {
		const raw = await fetchHTML( url );

		// Rewrite the document for client display, in order:
		// 1. drop <base> so it can't affect URL resolution in the client,
		// 2. absolutize relative article links (./Foo -> en.wikipedia.org/wiki/Foo),
		// 3. upgrade protocol-relative src/href/srcset URLs to https,
		// 4. mark every image as lazy-loading.
		const html = raw
			.replace( /<base[^>]*>/gi, '' )
			.replace( /href="\.\/([^"]+)"/gi, 'href="https://en.wikipedia.org/wiki/$1"' )
			.replace( /src="\/\//gi, 'src="https://' )
			.replace( /href="\/\//gi, 'href="https://' )
			.replace( /srcset="([^"]*)"/gi, ( _, srcset ) => {
				const upgraded = srcset
					.replace( /^\/\//g, 'https://' )
					.replace( /(\s)\/\//g, '$1https://' );
				return `srcset="${ upgraded }"`;
			} )
			.replace( /<img /gi, '<img loading="lazy" ' );

		// Prefer the document's own <title>; fall back to the requested title.
		const extractedTitle = html.match( /<title>([^<]+)<\/title>/ )?.[ 1 ] ?? title;

		return {
			html,
			title: extractedTitle,
			displayTitle: extractedTitle,
			sections: [] // REST API doesn't return sections separately, but they're in the HTML
		};
	} catch ( error ) {
		// A 404 means the article doesn't exist; surface that as null.
		if ( error.message.includes( '404' ) ) {
			return null;
		}
		throw error;
	}
}

/**
 * Search Wikipedia articles by title prefix.
 *
 * @param {string} query Search text; blank or empty input returns no results
 * @param {number} [limit=10] Maximum number of results to return
 * @return {Promise<Array<{title: string, description: string}>>}
 *   Matches in search-rank order, each with a (possibly empty) description
 */
export async function searchArticles( query, limit = 10 ) {
	if ( !query || query.trim() === '' ) {
		return [];
	}

	// Build the query string with URLSearchParams for safe, consistent
	// encoding of arbitrary user input (same pattern as getArticleMetadata,
	// replacing the previous hand-concatenated URL).
	const params = new URLSearchParams( {
		action: 'query',
		format: 'json',
		generator: 'prefixsearch',
		gpssearch: query,
		gpsnamespace: '0',
		gpslimit: String( limit ),
		prop: 'description'
	} );
	const url = `https://en.wikipedia.org/w/api.php?${ params }`;

	const response = await fetchJSON( url );

	const pages = response.query?.pages;
	if ( !pages ) {
		return [];
	}

	// Pages are keyed by page ID; `index` carries the prefixsearch ranking.
	return Object.values( pages )
		.sort( ( a, b ) => a.index - b.index )
		.map( ( page ) => ( {
			title: page.title,
			description: page.description || ''
		} ) );
}