Spaces:
Sleeping
Sleeping
// User-Agent string sent with every request made through the helpers below.
const USER_AGENT = 'WikipediaQuestionExplorer/1.0 (prototype)';

/**
 * Issue a request with the shared User-Agent header and reject replies whose
 * `ok` flag is false.
 *
 * @param {string} url - URL to request.
 * @returns {Promise<Response>} The response, only when `response.ok` is true.
 * @throws {Error} With message `HTTP <status>: <statusText>` when the response
 *   is not ok. The numeric status is also exposed as `error.status` so callers
 *   do not have to parse the message.
 */
async function fetchChecked( url ) {
	const response = await fetch( url, {
		headers: { 'User-Agent': USER_AGENT }
	} );
	if ( !response.ok ) {
		const error = new Error( `HTTP ${ response.status }: ${ response.statusText }` );
		error.status = response.status;
		throw error;
	}
	return response;
}

/**
 * Fetch a URL and parse the response body as JSON.
 *
 * @param {string} url - URL to request.
 * @returns {Promise<any>} The parsed JSON body.
 * @throws {Error} `HTTP <status>: <statusText>` for non-ok responses.
 */
async function fetchJSON( url ) {
	const response = await fetchChecked( url );
	return response.json();
}

/**
 * Fetch a URL and return the response body as text.
 *
 * @param {string} url - URL to request.
 * @returns {Promise<string>} The response body.
 * @throws {Error} `HTTP <status>: <statusText>` for non-ok responses.
 */
async function fetchHTML( url ) {
	const response = await fetchChecked( url );
	return response.text();
}
/**
 * Fetch article metadata including revision ID using the Action API.
 *
 * The request asks the API to resolve redirects (`redirects=1`); the returned
 * title is whatever title the API reports for the first page in the response.
 *
 * @param {string} title - Article title to look up.
 * @returns {Promise<{title: string, revisionId: (number|undefined)}|null>}
 *   The page title and the `revid` of the first returned revision, or `null`
 *   when the response has no pages, the page is flagged missing or invalid,
 *   or the request failed with HTTP 404.
 * @throws {Error} Any request failure other than HTTP 404.
 */
export async function getArticleMetadata( title ) {
	const params = new URLSearchParams( {
		action: 'query',
		format: 'json',
		titles: title,
		prop: 'revisions',
		rvprop: 'ids',
		redirects: '1'
	} );
	const url = `https://en.wikipedia.org/w/api.php?${ params }`;

	let data;
	try {
		data = await fetchJSON( url );
	} catch ( error ) {
		// Match only the "HTTP 404: ..." message produced for a 404 status, so an
		// unrelated failure that merely mentions "404" is not mistaken for "not found".
		if ( /^HTTP 404\b/.test( error?.message ?? '' ) ) {
			return null;
		}
		throw error;
	}

	const pages = data.query?.pages;
	if ( !pages ) {
		return null;
	}

	// `pages` is keyed by page ID; only the first entry is relevant for a single-title query.
	const page = Object.values( pages )[ 0 ];
	if ( !page || page.missing !== undefined || page.invalid !== undefined ) {
		return null;
	}

	return {
		title: page.title,
		revisionId: page.revisions?.[ 0 ]?.revid
	};
}
/**
 * Rewrite fetched article markup so it can be embedded outside Wikipedia.
 *
 * @param {string} html - Raw HTML as returned by the REST endpoint.
 * @returns {string} HTML with the `<base>` tag removed, `./`-relative and
 *   protocol-relative URLs made absolute, and `loading="lazy"` added to images.
 */
function rewriteArticleHtml( html ) {
	return html
		// Remove the <base> tag to prevent it from affecting URL resolution in the client
		.replace( /<base[^>]*>/gi, '' )
		// Convert relative links to absolute Wikipedia URLs
		.replace( /href="\.\/([^"]+)"/gi, 'href="https://en.wikipedia.org/wiki/$1"' )
		// Convert protocol-relative URLs to https
		.replace( /src="\/\//gi, 'src="https://' )
		.replace( /href="\/\//gi, 'href="https://' )
		// srcset holds several candidates; fix the first one and every one that follows whitespace
		.replace( /srcset="([^"]*)"/gi, ( _, srcset ) => {
			const fixed = srcset
				.replace( /^\/\//g, 'https://' )
				.replace( /(\s)\/\//g, '$1https://' );
			return `srcset="${ fixed }"`;
		} )
		// Add lazy loading to images
		.replace( /<img /gi, '<img loading="lazy" ' );
}

/**
 * Fetch parsed HTML for an article using the REST API (Parsoid).
 * Returns HTML with proper section structure for chunking.
 *
 * @param {string} title - Article title; spaces are converted to underscores
 *   and the result is URI-encoded before being placed in the request path.
 * @returns {Promise<{html: string, title: string, displayTitle: string, sections: Array}|null>}
 *   The rewritten HTML plus the text of its `<title>` element (falling back to
 *   the `title` argument when none is found), or `null` when the request failed
 *   with HTTP 404. `sections` is always an empty array.
 * @throws {Error} Any request failure other than HTTP 404.
 */
export async function getArticleHtml( title ) {
	const encodedTitle = encodeURIComponent( title.replace( / /g, '_' ) );
	const url = `https://en.wikipedia.org/api/rest_v1/page/html/${ encodedTitle }`;

	let rawHtml;
	try {
		rawHtml = await fetchHTML( url );
	} catch ( error ) {
		// Match only the "HTTP 404: ..." message produced for a 404 status, so an
		// unrelated failure that merely mentions "404" is not mistaken for "not found".
		if ( /^HTTP 404\b/.test( error?.message ?? '' ) ) {
			return null;
		}
		throw error;
	}

	const html = rewriteArticleHtml( rawHtml );

	// Extract title from the HTML
	const titleMatch = html.match( /<title>([^<]+)<\/title>/ );
	const extractedTitle = titleMatch ? titleMatch[ 1 ] : title;

	return {
		html,
		title: extractedTitle,
		displayTitle: extractedTitle,
		sections: [] // REST API doesn't return sections separately, but they're in the HTML
	};
}
/**
 * Search Wikipedia articles by title prefix (main namespace only).
 *
 * @param {string} query - Search text; an empty or whitespace-only query
 *   returns no results without contacting the API.
 * @param {number} [limit=10] - Value sent as `gpslimit`.
 * @returns {Promise<Array<{title: string, description: string}>>} Matches
 *   ordered by the `index` the API assigned to each page; `description` is
 *   `''` when the API supplied none.
 * @throws {Error} When the underlying request fails.
 */
export async function searchArticles( query, limit = 10 ) {
	if ( !query || query.trim() === '' ) {
		return [];
	}

	// URLSearchParams escapes every value, so neither `query` nor `limit` can
	// smuggle additional parameters into the request.
	const params = new URLSearchParams( {
		action: 'query',
		format: 'json',
		generator: 'prefixsearch',
		gpssearch: query,
		gpsnamespace: '0',
		gpslimit: String( limit ),
		prop: 'description'
	} );
	const url = `https://en.wikipedia.org/w/api.php?${ params }`;

	const response = await fetchJSON( url );
	const pages = response.query?.pages;
	if ( !pages ) {
		return [];
	}

	// `pages` is keyed by page ID, so restore the ranking order via `index`.
	return Object.values( pages )
		.sort( ( a, b ) => a.index - b.index )
		.map( ( page ) => ( {
			title: page.title,
			description: page.description || ''
		} ) );
}