Spaces:
Sleeping
Sleeping
Eric Gardner
committed on
Commit
·
7ecba03
1
Parent(s):
cdc50ff
Use Wikipedia REST API for article content
Browse files
- index.js +4 -1
- services/chunker.js +2 -1
- services/wikipedia.js +27 -34
index.js
CHANGED
|
@@ -7,7 +7,10 @@ import { initEmbedder } from './services/embedder.js';
|
|
| 7 |
const app = express();
|
| 8 |
const PORT = process.env.PORT || 3000;
|
| 9 |
|
| 10 |
-
app.use( cors(
|
|
|
|
|
|
|
|
|
|
| 11 |
app.use( express.json() );
|
| 12 |
|
| 13 |
// Routes
|
|
|
|
| 7 |
const app = express();
|
| 8 |
const PORT = process.env.PORT || 3000;
|
| 9 |
|
| 10 |
+
app.use( cors( {
|
| 11 |
+
origin: true,
|
| 12 |
+
credentials: true
|
| 13 |
+
} ) );
|
| 14 |
app.use( express.json() );
|
| 15 |
|
| 16 |
// Routes
|
services/chunker.js
CHANGED
|
@@ -27,7 +27,8 @@ export function chunkArticle( html, sections = [] ) {
|
|
| 27 |
}
|
| 28 |
} );
|
| 29 |
|
| 30 |
-
|
|
|
|
| 31 |
if ( !container ) {
|
| 32 |
return chunks;
|
| 33 |
}
|
|
|
|
| 27 |
}
|
| 28 |
} );
|
| 29 |
|
| 30 |
+
// Parsoid REST API puts mw-parser-output class on body, Action API wraps in a div
|
| 31 |
+
const container = doc.querySelector( '.mw-parser-output' ) || doc.body;
|
| 32 |
if ( !container ) {
|
| 33 |
return chunks;
|
| 34 |
}
|
services/wikipedia.js
CHANGED
|
@@ -12,6 +12,18 @@ async function fetchJSON( url ) {
|
|
| 12 |
return response.json();
|
| 13 |
}
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
/**
|
| 16 |
* Fetch article metadata including revision ID using the Action API
|
| 17 |
*/
|
|
@@ -52,46 +64,26 @@ export async function getArticleMetadata( title ) {
|
|
| 52 |
}
|
| 53 |
|
| 54 |
/**
|
| 55 |
-
* Fetch parsed HTML for an article using the
|
| 56 |
-
* Returns
|
| 57 |
*/
|
| 58 |
export async function getArticleHtml( title ) {
|
| 59 |
-
const
|
| 60 |
-
|
| 61 |
-
format: 'json',
|
| 62 |
-
page: title,
|
| 63 |
-
prop: 'text|sections|displaytitle',
|
| 64 |
-
redirects: '1',
|
| 65 |
-
disableeditsection: '1',
|
| 66 |
-
disabletoc: '1',
|
| 67 |
-
mobileformat: '1',
|
| 68 |
-
useparsoid: '1'
|
| 69 |
-
} );
|
| 70 |
-
|
| 71 |
-
const url = `https://en.wikipedia.org/w/api.php?${ params }`;
|
| 72 |
|
| 73 |
try {
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
if ( data.error ) {
|
| 77 |
-
if ( data.error.code === 'missingtitle' ) {
|
| 78 |
-
return null;
|
| 79 |
-
}
|
| 80 |
-
throw new Error( data.error.info );
|
| 81 |
-
}
|
| 82 |
|
| 83 |
-
|
|
|
|
| 84 |
|
| 85 |
// Convert relative links to absolute Wikipedia URLs
|
| 86 |
-
html = html.replace( /href="\/wiki\//gi, 'href="https://en.wikipedia.org/wiki/' );
|
| 87 |
-
html = html.replace( /href="\/w\//gi, 'href="https://en.wikipedia.org/w/' );
|
| 88 |
html = html.replace( /href="\.\/([^"]+)"/gi, 'href="https://en.wikipedia.org/wiki/$1"' );
|
| 89 |
|
| 90 |
-
// Convert protocol-relative URLs to https
|
| 91 |
html = html.replace( /src="\/\//gi, 'src="https://' );
|
| 92 |
-
|
| 93 |
html = html.replace( /srcset="([^"]*)"/gi, ( _, srcset ) => {
|
| 94 |
-
// Replace space followed by // (new URL entry) or start of string //
|
| 95 |
const fixed = srcset
|
| 96 |
.replace( /^\/\//g, 'https://' )
|
| 97 |
.replace( /(\s)\/\//g, '$1https://' );
|
|
@@ -101,14 +93,15 @@ export async function getArticleHtml( title ) {
|
|
| 101 |
// Add lazy loading to images
|
| 102 |
html = html.replace( /<img /gi, '<img loading="lazy" ' );
|
| 103 |
|
| 104 |
-
//
|
| 105 |
-
|
|
|
|
| 106 |
|
| 107 |
return {
|
| 108 |
html,
|
| 109 |
-
title:
|
| 110 |
-
displayTitle:
|
| 111 |
-
sections:
|
| 112 |
};
|
| 113 |
} catch ( error ) {
|
| 114 |
if ( error.message.includes( '404' ) ) {
|
|
|
|
| 12 |
return response.json();
|
| 13 |
}
|
| 14 |
|
| 15 |
+
async function fetchHTML( url ) {
|
| 16 |
+
const response = await fetch( url, {
|
| 17 |
+
headers: { 'User-Agent': USER_AGENT }
|
| 18 |
+
} );
|
| 19 |
+
|
| 20 |
+
if ( !response.ok ) {
|
| 21 |
+
throw new Error( `HTTP ${ response.status }: ${ response.statusText }` );
|
| 22 |
+
}
|
| 23 |
+
|
| 24 |
+
return response.text();
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
/**
|
| 28 |
* Fetch article metadata including revision ID using the Action API
|
| 29 |
*/
|
|
|
|
| 64 |
}
|
| 65 |
|
| 66 |
/**
|
| 67 |
+
* Fetch parsed HTML for an article using the REST API (Parsoid)
|
| 68 |
+
* Returns HTML with proper section structure for chunking
|
| 69 |
*/
|
| 70 |
export async function getArticleHtml( title ) {
|
| 71 |
+
const encodedTitle = encodeURIComponent( title.replace( / /g, '_' ) );
|
| 72 |
+
const url = `https://en.wikipedia.org/api/rest_v1/page/html/${ encodedTitle }`;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
try {
|
| 75 |
+
let html = await fetchHTML( url );
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
+
// Remove the <base> tag to prevent it from affecting URL resolution in the client
|
| 78 |
+
html = html.replace( /<base[^>]*>/gi, '' );
|
| 79 |
|
| 80 |
// Convert relative links to absolute Wikipedia URLs
|
|
|
|
|
|
|
| 81 |
html = html.replace( /href="\.\/([^"]+)"/gi, 'href="https://en.wikipedia.org/wiki/$1"' );
|
| 82 |
|
| 83 |
+
// Convert protocol-relative URLs to https
|
| 84 |
html = html.replace( /src="\/\//gi, 'src="https://' );
|
| 85 |
+
html = html.replace( /href="\/\//gi, 'href="https://' );
|
| 86 |
html = html.replace( /srcset="([^"]*)"/gi, ( _, srcset ) => {
|
|
|
|
| 87 |
const fixed = srcset
|
| 88 |
.replace( /^\/\//g, 'https://' )
|
| 89 |
.replace( /(\s)\/\//g, '$1https://' );
|
|
|
|
| 93 |
// Add lazy loading to images
|
| 94 |
html = html.replace( /<img /gi, '<img loading="lazy" ' );
|
| 95 |
|
| 96 |
+
// Extract title from the HTML
|
| 97 |
+
const titleMatch = html.match( /<title>([^<]+)<\/title>/ );
|
| 98 |
+
const extractedTitle = titleMatch ? titleMatch[ 1 ] : title;
|
| 99 |
|
| 100 |
return {
|
| 101 |
html,
|
| 102 |
+
title: extractedTitle,
|
| 103 |
+
displayTitle: extractedTitle,
|
| 104 |
+
sections: [] // REST API doesn't return sections separately, but they're in the HTML
|
| 105 |
};
|
| 106 |
} catch ( error ) {
|
| 107 |
if ( error.message.includes( '404' ) ) {
|