Eric Gardner committed on
Commit
7ecba03
·
1 Parent(s): cdc50ff

Use Wikipedia REST API for article content

Browse files
Files changed (3) hide show
  1. index.js +4 -1
  2. services/chunker.js +2 -1
  3. services/wikipedia.js +27 -34
index.js CHANGED
@@ -7,7 +7,10 @@ import { initEmbedder } from './services/embedder.js';
7
  const app = express();
8
  const PORT = process.env.PORT || 3000;
9
 
10
- app.use( cors() );
 
 
 
11
  app.use( express.json() );
12
 
13
  // Routes
 
7
  const app = express();
8
  const PORT = process.env.PORT || 3000;
9
 
10
+ app.use( cors( {
11
+ origin: true,
12
+ credentials: true
13
+ } ) );
14
  app.use( express.json() );
15
 
16
  // Routes
services/chunker.js CHANGED
@@ -27,7 +27,8 @@ export function chunkArticle( html, sections = [] ) {
27
  }
28
  } );
29
 
30
- const container = doc.querySelector( '.mw-parser-output' );
 
31
  if ( !container ) {
32
  return chunks;
33
  }
 
27
  }
28
  } );
29
 
30
+ // Parsoid REST API puts mw-parser-output class on body, Action API wraps in a div
31
+ const container = doc.querySelector( '.mw-parser-output' ) || doc.body;
32
  if ( !container ) {
33
  return chunks;
34
  }
services/wikipedia.js CHANGED
@@ -12,6 +12,18 @@ async function fetchJSON( url ) {
12
  return response.json();
13
  }
14
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  /**
16
  * Fetch article metadata including revision ID using the Action API
17
  */
@@ -52,46 +64,26 @@ export async function getArticleMetadata( title ) {
52
  }
53
 
54
  /**
55
- * Fetch parsed HTML for an article using the Action API with mobile format and Parsoid
56
- * Returns cleaner HTML optimized for mobile viewing (no infoboxes, cleaner structure)
57
  */
58
  export async function getArticleHtml( title ) {
59
- const params = new URLSearchParams( {
60
- action: 'parse',
61
- format: 'json',
62
- page: title,
63
- prop: 'text|sections|displaytitle',
64
- redirects: '1',
65
- disableeditsection: '1',
66
- disabletoc: '1',
67
- mobileformat: '1',
68
- useparsoid: '1'
69
- } );
70
-
71
- const url = `https://en.wikipedia.org/w/api.php?${ params }`;
72
 
73
  try {
74
- const data = await fetchJSON( url );
75
-
76
- if ( data.error ) {
77
- if ( data.error.code === 'missingtitle' ) {
78
- return null;
79
- }
80
- throw new Error( data.error.info );
81
- }
82
 
83
- let html = data.parse.text[ '*' ];
 
84
 
85
  // Convert relative links to absolute Wikipedia URLs
86
- html = html.replace( /href="\/wiki\//gi, 'href="https://en.wikipedia.org/wiki/' );
87
- html = html.replace( /href="\/w\//gi, 'href="https://en.wikipedia.org/w/' );
88
  html = html.replace( /href="\.\/([^"]+)"/gi, 'href="https://en.wikipedia.org/wiki/$1"' );
89
 
90
- // Convert protocol-relative URLs to https (only at start of URL, not ://)
91
  html = html.replace( /src="\/\//gi, 'src="https://' );
92
- // Handle srcset with multiple entries - only replace // at start of URLs, not :// in the middle
93
  html = html.replace( /srcset="([^"]*)"/gi, ( _, srcset ) => {
94
- // Replace space followed by // (new URL entry) or start of string //
95
  const fixed = srcset
96
  .replace( /^\/\//g, 'https://' )
97
  .replace( /(\s)\/\//g, '$1https://' );
@@ -101,14 +93,15 @@ export async function getArticleHtml( title ) {
101
  // Add lazy loading to images
102
  html = html.replace( /<img /gi, '<img loading="lazy" ' );
103
 
104
- // Wrap in a container div with class for styling
105
- html = `<div class="mw-parser-output">${ html }</div>`;
 
106
 
107
  return {
108
  html,
109
- title: data.parse.title,
110
- displayTitle: data.parse.displaytitle,
111
- sections: data.parse.sections
112
  };
113
  } catch ( error ) {
114
  if ( error.message.includes( '404' ) ) {
 
12
  return response.json();
13
  }
14
 
15
/**
 * Fetch a URL and return its response body as text.
 * Sends the project User-Agent header on the request and throws an
 * Error describing the HTTP status when the response is not OK.
 *
 * @param {string} url - Fully-qualified URL to fetch.
 * @returns {Promise<string>} The raw response body text.
 * @throws {Error} When the HTTP status is outside the 2xx range.
 */
async function fetchHTML( url ) {
	const response = await fetch( url, { headers: { 'User-Agent': USER_AGENT } } );

	// Successful response: hand back the body as text.
	if ( response.ok ) {
		return response.text();
	}

	throw new Error( `HTTP ${ response.status }: ${ response.statusText }` );
}
26
+
27
  /**
28
  * Fetch article metadata including revision ID using the Action API
29
  */
 
64
  }
65
 
66
  /**
67
+ * Fetch parsed HTML for an article using the REST API (Parsoid)
68
+ * Returns HTML with proper section structure for chunking
69
  */
70
  export async function getArticleHtml( title ) {
71
+ const encodedTitle = encodeURIComponent( title.replace( / /g, '_' ) );
72
+ const url = `https://en.wikipedia.org/api/rest_v1/page/html/${ encodedTitle }`;
 
 
 
 
 
 
 
 
 
 
 
73
 
74
  try {
75
+ let html = await fetchHTML( url );
 
 
 
 
 
 
 
76
 
77
+ // Remove the <base> tag to prevent it from affecting URL resolution in the client
78
+ html = html.replace( /<base[^>]*>/gi, '' );
79
 
80
  // Convert relative links to absolute Wikipedia URLs
 
 
81
  html = html.replace( /href="\.\/([^"]+)"/gi, 'href="https://en.wikipedia.org/wiki/$1"' );
82
 
83
+ // Convert protocol-relative URLs to https
84
  html = html.replace( /src="\/\//gi, 'src="https://' );
85
+ html = html.replace( /href="\/\//gi, 'href="https://' );
86
  html = html.replace( /srcset="([^"]*)"/gi, ( _, srcset ) => {
 
87
  const fixed = srcset
88
  .replace( /^\/\//g, 'https://' )
89
  .replace( /(\s)\/\//g, '$1https://' );
 
93
  // Add lazy loading to images
94
  html = html.replace( /<img /gi, '<img loading="lazy" ' );
95
 
96
+ // Extract title from the HTML
97
+ const titleMatch = html.match( /<title>([^<]+)<\/title>/ );
98
+ const extractedTitle = titleMatch ? titleMatch[ 1 ] : title;
99
 
100
  return {
101
  html,
102
+ title: extractedTitle,
103
+ displayTitle: extractedTitle,
104
+ sections: [] // REST API doesn't return sections separately, but they're in the HTML
105
  };
106
  } catch ( error ) {
107
  if ( error.message.includes( '404' ) ) {