Spaces:
Sleeping
Sleeping
File size: 3,524 Bytes
cdc50ff 7ecba03 cdc50ff 7ecba03 cdc50ff 7ecba03 cdc50ff 7ecba03 cdc50ff 7ecba03 cdc50ff 7ecba03 cdc50ff 7ecba03 cdc50ff 7ecba03 cdc50ff 7ecba03 cdc50ff |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
// Value sent as the User-Agent request header by fetchJSON and fetchHTML.
const USER_AGENT = 'WikipediaQuestionExplorer/1.0 (prototype)';
/**
 * Request a URL with fetch and parse the response body as JSON.
 *
 * @param {string} url - Address to request.
 * @return {Promise<*>} Resolves with the result of `response.json()`.
 * @throws {Error} With message `HTTP <status>: <statusText>` when `response.ok` is false.
 */
async function fetchJSON( url ) {
	const requestOptions = {
		headers: { 'User-Agent': USER_AGENT }
	};
	const res = await fetch( url, requestOptions );

	// Success path first; anything else is surfaced as an error carrying the status line.
	if ( res.ok ) {
		return res.json();
	}

	const { status, statusText } = res;
	throw new Error( `HTTP ${ status }: ${ statusText }` );
}
/**
 * Request a URL with fetch and return the response body as text.
 *
 * @param {string} url - Address to request.
 * @return {Promise<string>} Resolves with the result of `response.text()`.
 * @throws {Error} With message `HTTP <status>: <statusText>` when `response.ok` is false.
 */
async function fetchHTML( url ) {
	const requestOptions = {
		headers: { 'User-Agent': USER_AGENT }
	};
	const res = await fetch( url, requestOptions );

	// Success path first; anything else is surfaced as an error carrying the status line.
	if ( res.ok ) {
		return res.text();
	}

	const { status, statusText } = res;
	throw new Error( `HTTP ${ status }: ${ statusText }` );
}
/**
 * Fetch article metadata, including a revision ID, using the Action API.
 *
 * The request asks the API to resolve redirects (`redirects=1`), so the
 * returned title is whatever title the API reports for the page.
 *
 * @param {string} title - Article title to look up.
 * @return {Promise<{title: string, revisionId: *}|null>} The page title and the
 *   `revid` of the first revision entry in the response (undefined when the
 *   response carries no revisions), or null when the response has no usable
 *   page: no `query.pages`, an empty page map, a page flagged `missing` or
 *   `invalid`, or an HTTP 404 from the API.
 * @throws {Error} Any request failure other than HTTP 404.
 */
export async function getArticleMetadata( title ) {
	const params = new URLSearchParams( {
		action: 'query',
		format: 'json',
		titles: title,
		prop: 'revisions',
		rvprop: 'ids',
		redirects: '1'
	} );
	const url = `https://en.wikipedia.org/w/api.php?${ params }`;

	let data;
	try {
		data = await fetchJSON( url );
	} catch ( error ) {
		// fetchJSON reports HTTP failures as "HTTP <status>: <statusText>".
		// Anchor on that prefix so an unrelated message that merely contains
		// "404" is not mistaken for "not found", and tolerate thrown values
		// that have no message at all.
		if ( /^HTTP 404\b/.test( error?.message ?? '' ) ) {
			return null;
		}
		throw error;
	}

	// Only the first page entry is considered; the page map may be absent or empty.
	const [ page ] = Object.values( data?.query?.pages ?? {} );
	if ( !page || page.missing !== undefined || page.invalid !== undefined ) {
		return null;
	}

	return {
		title: page.title,
		revisionId: page.revisions?.[ 0 ]?.revid
	};
}
/**
 * Fetch parsed HTML for an article using the REST API (Parsoid) and rewrite
 * it so it can be embedded outside of Wikipedia.
 *
 * Rewrites applied, in order: the `<base>` tag is removed, `./`-relative
 * links become absolute `https://en.wikipedia.org/wiki/` URLs,
 * protocol-relative `src`/`href`/`srcset` URLs are switched to https, and
 * images get `loading="lazy"` unless they already carry a `loading` attribute.
 *
 * @param {string} title - Article title; spaces are converted to underscores
 *   before the title is URL-encoded into the request path.
 * @return {Promise<{html: string, title: string, displayTitle: string, sections: Array}|null>}
 *   The rewritten HTML plus the text of its `<title>` element (falling back to
 *   the requested title when there is none), or null on HTTP 404. `sections`
 *   is always an empty array: sections are not returned separately and are
 *   only available inside `html`.
 * @throws {Error} Any request failure other than HTTP 404.
 */
export async function getArticleHtml( title ) {
	const encodedTitle = encodeURIComponent( title.replace( / /g, '_' ) );
	const url = `https://en.wikipedia.org/api/rest_v1/page/html/${ encodedTitle }`;

	let rawHtml;
	try {
		rawHtml = await fetchHTML( url );
	} catch ( error ) {
		// fetchHTML reports HTTP failures as "HTTP <status>: <statusText>".
		// Anchor on that prefix so an unrelated failure whose message merely
		// contains "404" (for example a URL ending in /404) is not silently
		// reported as a missing article, and tolerate thrown values that
		// have no message at all.
		if ( /^HTTP 404\b/.test( error?.message ?? '' ) ) {
			return null;
		}
		throw error;
	}

	const html = rawHtml
		// Remove the <base> tag to prevent it from affecting URL resolution in the client
		.replace( /<base[^>]*>/gi, '' )
		// Convert relative links to absolute Wikipedia URLs
		.replace( /href="\.\/([^"]+)"/gi, 'href="https://en.wikipedia.org/wiki/$1"' )
		// Convert protocol-relative URLs to https
		.replace( /src="\/\//gi, 'src="https://' )
		.replace( /href="\/\//gi, 'href="https://' )
		// srcset holds several candidates; fix the first one and every one that follows whitespace
		.replace( /srcset="([^"]*)"/gi, ( _, srcset ) => {
			const fixed = srcset
				.replace( /^\/\//g, 'https://' )
				.replace( /(\s)\/\//g, '$1https://' );
			return `srcset="${ fixed }"`;
		} )
		// Add lazy loading to images, skipping tags that already declare a
		// loading attribute so the attribute is never duplicated
		.replace( /<img(?=\s)(?![^>]*\sloading\s*=)/gi, '<img loading="lazy"' );

	// Extract title from the HTML. NOTE(review): the text is taken verbatim
	// from the markup, so HTML entities in it are not decoded.
	const titleMatch = html.match( /<title>([^<]+)<\/title>/ );
	const extractedTitle = titleMatch ? titleMatch[ 1 ] : title;

	return {
		html,
		title: extractedTitle,
		displayTitle: extractedTitle,
		sections: [] // REST API doesn't return sections separately, but they're in the HTML
	};
}
/**
 * Search Wikipedia articles by title prefix using the Action API's
 * `prefixsearch` generator, restricted to namespace 0.
 *
 * @param {string} query - Search text. Empty, whitespace-only or otherwise
 *   falsy queries short-circuit to an empty result without a request.
 * @param {number} [limit=10] - Value sent as `gpslimit`.
 * @return {Promise<Array<{title: string, description: string}>>} Matches
 *   ordered by the `index` field of each returned page; `description` is an
 *   empty string when the page has none. Empty when the response has no
 *   `query.pages`.
 * @throws {Error} Any failure raised by the underlying request.
 */
export async function searchArticles( query, limit = 10 ) {
	if ( !query ) {
		return [];
	}
	// Coerce so that a non-string query cannot crash on .trim().
	const term = String( query );
	if ( term.trim() === '' ) {
		return [];
	}

	// Build the query string with URLSearchParams (as getArticleMetadata does)
	// so every value, including `limit`, is encoded and cannot inject
	// additional parameters.
	const params = new URLSearchParams( {
		action: 'query',
		format: 'json',
		generator: 'prefixsearch',
		gpssearch: term,
		gpsnamespace: '0',
		gpslimit: String( limit ),
		prop: 'description'
	} );
	const url = `https://en.wikipedia.org/w/api.php?${ params }`;

	const response = await fetchJSON( url );
	const pages = response?.query?.pages;
	if ( !pages ) {
		return [];
	}

	// `pages` is keyed by page ID; `index` carries the position used for ordering.
	return Object.values( pages )
		.sort( ( a, b ) => a.index - b.index )
		.map( ( page ) => ( {
			title: page.title,
			description: page.description || ''
		} ) );
}
|