File size: 3,524 Bytes
cdc50ff
 
 
 
 
 
 
 
 
 
 
 
 
 
7ecba03
 
 
 
 
 
 
 
 
 
 
 
cdc50ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7ecba03
 
cdc50ff
 
7ecba03
 
cdc50ff
 
7ecba03
cdc50ff
7ecba03
 
cdc50ff
 
 
 
7ecba03
cdc50ff
7ecba03
cdc50ff
 
 
 
 
 
 
 
 
 
7ecba03
 
 
cdc50ff
 
 
7ecba03
 
 
cdc50ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
// Identifies this client to Wikipedia's APIs (Wikimedia asks bots/tools to send
// a descriptive User-Agent); sent on every request below.
const USER_AGENT = 'WikipediaQuestionExplorer/1.0 (prototype)';

/**
 * GET a URL with this app's User-Agent and parse the response body as JSON.
 *
 * @param {string} url Fully-formed URL to request
 * @return {Promise<Object>} Parsed JSON response body
 * @throws {Error} On any non-2xx HTTP status
 */
async function fetchJSON( url ) {
	const headers = { 'User-Agent': USER_AGENT };
	const response = await fetch( url, { headers } );

	if ( response.ok ) {
		return response.json();
	}

	throw new Error( `HTTP ${ response.status }: ${ response.statusText }` );
}

/**
 * GET a URL with this app's User-Agent and return the raw response text.
 *
 * @param {string} url Fully-formed URL to request
 * @return {Promise<string>} Response body as text
 * @throws {Error} On any non-2xx HTTP status
 */
async function fetchHTML( url ) {
	const headers = { 'User-Agent': USER_AGENT };
	const response = await fetch( url, { headers } );

	if ( response.ok ) {
		return response.text();
	}

	throw new Error( `HTTP ${ response.status }: ${ response.statusText }` );
}

/**
 * Fetch article metadata including revision ID using the Action API.
 *
 * @param {string} title Article title; redirects are followed server-side
 * @return {Promise<?{title: string, revisionId: number|undefined}>}
 *   Normalized title and latest revision ID, or null if the page is missing
 *   (or the request 404s)
 */
export async function getArticleMetadata( title ) {
	const params = new URLSearchParams( {
		action: 'query',
		format: 'json',
		titles: title,
		prop: 'revisions',
		rvprop: 'ids',
		redirects: '1'
	} );

	try {
		const data = await fetchJSON( `https://en.wikipedia.org/w/api.php?${ params }` );
		const pages = data.query?.pages;
		if ( !pages ) {
			return null;
		}

		// The API keys pages by page ID; a single-title query yields one entry.
		const [ page ] = Object.values( pages );

		return page.missing === undefined ?
			{
				title: page.title,
				revisionId: page.revisions?.[ 0 ]?.revid
			} :
			null;
	} catch ( error ) {
		// Treat a 404 as "article not found" rather than an error.
		if ( error.message.includes( '404' ) ) {
			return null;
		}
		throw error;
	}
}

/**
 * Fetch parsed HTML for an article using the REST API (Parsoid).
 * Returns HTML with proper section structure for chunking.
 *
 * @param {string} title Article title (spaces allowed; converted to underscores)
 * @return {Promise<?{html: string, title: string, displayTitle: string, sections: Array}>}
 *   Rewritten HTML plus the extracted title, or null on 404
 */
export async function getArticleHtml( title ) {
	const encodedTitle = encodeURIComponent( title.replace( / /g, '_' ) );
	const url = `https://en.wikipedia.org/api/rest_v1/page/html/${ encodedTitle }`;

	try {
		const raw = await fetchHTML( url );

		// Rewrite the document for client display, in order:
		// 1. drop <base> so it can't affect URL resolution in the client,
		// 2. absolutize relative article links (./Foo -> en.wikipedia.org/wiki/Foo),
		// 3. upgrade protocol-relative src/href/srcset URLs to https,
		// 4. mark every image as lazy-loading.
		const html = raw
			.replace( /<base[^>]*>/gi, '' )
			.replace( /href="\.\/([^"]+)"/gi, 'href="https://en.wikipedia.org/wiki/$1"' )
			.replace( /src="\/\//gi, 'src="https://' )
			.replace( /href="\/\//gi, 'href="https://' )
			.replace( /srcset="([^"]*)"/gi, ( _, srcset ) => {
				const upgraded = srcset
					.replace( /^\/\//g, 'https://' )
					.replace( /(\s)\/\//g, '$1https://' );
				return `srcset="${ upgraded }"`;
			} )
			.replace( /<img /gi, '<img loading="lazy" ' );

		// Prefer the document's own <title>; fall back to the requested title.
		const extractedTitle = html.match( /<title>([^<]+)<\/title>/ )?.[ 1 ] ?? title;

		return {
			html,
			title: extractedTitle,
			displayTitle: extractedTitle,
			sections: [] // REST API doesn't return sections separately, but they're in the HTML
		};
	} catch ( error ) {
		// A 404 means the article doesn't exist; surface that as null.
		if ( error.message.includes( '404' ) ) {
			return null;
		}
		throw error;
	}
}

/**
 * Search Wikipedia articles by title prefix.
 *
 * @param {string} query Search text; blank or empty input returns no results
 * @param {number} [limit=10] Maximum number of results to return
 * @return {Promise<Array<{title: string, description: string}>>}
 *   Matches in search-rank order, each with a (possibly empty) description
 */
export async function searchArticles( query, limit = 10 ) {
	if ( !query || query.trim() === '' ) {
		return [];
	}

	// Build the query string with URLSearchParams for safe, consistent
	// encoding of arbitrary user input (same pattern as getArticleMetadata,
	// replacing the previous hand-concatenated URL).
	const params = new URLSearchParams( {
		action: 'query',
		format: 'json',
		generator: 'prefixsearch',
		gpssearch: query,
		gpsnamespace: '0',
		gpslimit: String( limit ),
		prop: 'description'
	} );
	const url = `https://en.wikipedia.org/w/api.php?${ params }`;

	const response = await fetchJSON( url );

	const pages = response.query?.pages;
	if ( !pages ) {
		return [];
	}

	// Pages are keyed by page ID; `index` carries the prefixsearch ranking.
	return Object.values( pages )
		.sort( ( a, b ) => a.index - b.index )
		.map( ( page ) => ( {
			title: page.title,
			description: page.description || ''
		} ) );
}