import { Router } from 'express';
import { getArticleMetadata, getArticleHtml } from '../services/wikipedia.js';
import { chunkArticle } from '../services/chunker.js';
import { embedTexts, embedSingle } from '../services/embedder.js';
import { search } from '../services/vectorSearch.js';
import { getCached, setCache, isCacheValid } from '../services/cache.js';
import { getProcessingState, setProcessing } from '../services/processingState.js';
import { generateQuestionsWithClaude, isClaudeAvailable } from '../services/claudeQuestionGenerator.js';

const router = Router();

// Number of search results returned when the client does not ask for a specific amount.
const DEFAULT_TOP_K = 3;
// Upper bound on the number of search results a single query may request.
const MAX_TOP_K = 10;
// Number of questions requested from Claude and the most that are kept after validation.
const MAX_SUGGESTED_QUESTIONS = 5;
// A generated question is kept only if its best-matching chunk scores above this value.
const MIN_QUESTION_SCORE = 0.3;

/**
 * Resolve the article title from a route parameter.
 *
 * Per the Express documentation, route parameters arrive already
 * percent-decoded, so the extra decode kept here (for backward compatibility
 * with clients that double-encode) can throw a URIError on titles containing
 * a literal "%". In that case the raw parameter is used instead of failing
 * the whole request.
 *
 * @param {string} rawTitle - Value of `req.params.title`.
 * @returns {string} The decoded title, or `rawTitle` when it is not valid percent-encoding.
 */
function resolveTitle( rawTitle ) {
	try {
		return decodeURIComponent( rawTitle );
	} catch ( error ) {
		if ( error instanceof URIError ) {
			return rawTitle;
		}
		throw error;
	}
}

/**
 * Coerce the client-supplied `topK` into an integer between 1 and MAX_TOP_K.
 *
 * @param {*} topK - Raw value from the request body.
 * @returns {number} DEFAULT_TOP_K when the value is missing or not a finite number,
 *   otherwise the truncated value clamped to [1, MAX_TOP_K].
 */
function normalizeTopK( topK ) {
	if ( topK === undefined || topK === null ) {
		return DEFAULT_TOP_K;
	}
	const requested = Number( topK );
	if ( !Number.isFinite( requested ) ) {
		return DEFAULT_TOP_K;
	}
	return Math.min( Math.max( Math.trunc( requested ), 1 ), MAX_TOP_K );
}

/**
 * Build the object stored in the cache for a processed article.
 *
 * @param {string} title - Title the article was requested under (used as the cache key by callers).
 * @param {Object} articleData - Result of `getArticleHtml`; its `title` and `html` are stored.
 * @param {*} revisionId - Revision the cached content corresponds to.
 * @param {Array} chunks - Chunks (with embeddings attached, if any) to store.
 * @param {Array} suggestedQuestions - Validated suggested questions; may be empty.
 * @returns {Object} The cache entry.
 */
function buildCacheEntry( title, articleData, revisionId, chunks, suggestedQuestions ) {
	return {
		title: articleData.title,
		normalizedTitle: title.toLowerCase().replace( / /g, '-' ),
		revisionId,
		fetchedAt: new Date().toISOString(),
		html: articleData.html,
		chunkCount: chunks.length,
		chunks,
		suggestedQuestions
	};
}

/**
 * Ask Claude for suggested questions and keep only those whose best-matching
 * chunk scores above MIN_QUESTION_SCORE.
 *
 * This step is best-effort: if Claude is unavailable or anything in the
 * generation/validation throws, a message is logged and an empty list is
 * returned so article processing can still complete.
 *
 * @param {Array} chunks - Chunks with embeddings already attached.
 * @param {string} articleTitle - Title passed to the question generator.
 * @returns {Promise<Array>} At most MAX_SUGGESTED_QUESTIONS validated questions.
 */
async function generateSuggestedQuestions( chunks, articleTitle ) {
	if ( !isClaudeAvailable() ) {
		console.log( 'ANTHROPIC_API_KEY not set, skipping question generation' );
		return [];
	}

	try {
		console.log( 'Generating questions with Claude...' );
		const rawQuestions = await generateQuestionsWithClaude( chunks, articleTitle, MAX_SUGGESTED_QUESTIONS );
		console.log( `Claude generated questions:`, rawQuestions );

		// Validate questions by checking if they match article content
		const validatedQuestions = [];
		for ( const question of rawQuestions ) {
			const questionEmbedding = await embedSingle( question );
			const { results } = search( questionEmbedding, chunks, 1 );

			if ( results.length === 0 ) {
				console.log( `Question: "${ question }" -> no results` );
				continue;
			}

			const score = results[ 0 ].score;
			console.log( `Question: "${ question }" -> score: ${ score.toFixed( 3 ) }` );

			// Keep questions that have a good match (score > 0.3)
			if ( score > MIN_QUESTION_SCORE ) {
				validatedQuestions.push( question );
			}
		}

		const suggestedQuestions = validatedQuestions.slice( 0, MAX_SUGGESTED_QUESTIONS );
		console.log( `Generated ${ suggestedQuestions.length } validated questions` );
		return suggestedQuestions;
	} catch ( err ) {
		console.warn( 'Question generation failed, continuing without suggestions:', err.message );
		return [];
	}
}

/**
 * GET /api/article/:title
 * Fetch article content; initiates the embedding pipeline if not cached.
 *
 * Responds with:
 * - 404 when Wikipedia metadata cannot be found for the title,
 * - the cached article with status 'ready' when the cache matches the current revision,
 * - status 'processing' when processing is already running or has just been started,
 * - 500 on unexpected failure.
 */
router.get( '/:title', async ( req, res ) => {
	try {
		const title = resolveTitle( req.params.title );

		// Get current revision from Wikipedia
		const metadata = await getArticleMetadata( title );
		if ( !metadata ) {
			return res.status( 404 ).json( { error: 'Article not found' } );
		}

		// Check cache validity
		const cacheValid = await isCacheValid( title, metadata.revisionId );
		if ( cacheValid ) {
			const cached = await getCached( title );
			// The entry can disappear between the validity check and the read;
			// in that case fall through and (re)process instead of failing.
			if ( cached ) {
				return res.json( {
					title: cached.title,
					revisionId: cached.revisionId,
					html: cached.html,
					status: 'ready',
					chunkCount: cached.chunks.length,
					suggestedQuestions: cached.suggestedQuestions || []
				} );
			}
		}

		// Check if already processing
		const state = getProcessingState( title );
		if ( state.state === 'processing' ) {
			return res.json( {
				title: metadata.title,
				revisionId: metadata.revisionId,
				status: 'processing'
			} );
		}

		// Start async processing
		setProcessing( title, 'processing' );

		// Return immediately with processing status
		res.json( {
			title: metadata.title,
			revisionId: metadata.revisionId,
			status: 'processing'
		} );

		// Process in background; failures are recorded in the processing state
		// so the status endpoint can report them.
		processArticle( title, metadata.revisionId ).catch( ( err ) => {
			console.error( `Error processing ${ title }:`, err );
			setProcessing( title, 'error', err.message );
		} );
	} catch ( error ) {
		console.error( 'Article fetch error:', error );
		res.status( 500 ).json( { error: 'Failed to fetch article' } );
	}
} );

/**
 * GET /api/article/:title/status
 * Poll endpoint for embedding status.
 *
 * Reports 'ready' whenever a cache entry exists (no revision check is done
 * here), otherwise the recorded processing state: 'error', 'processing',
 * or 'unknown' when neither applies.
 */
router.get( '/:title/status', async ( req, res ) => {
	try {
		const title = resolveTitle( req.params.title );

		const cached = await getCached( title );
		if ( cached ) {
			return res.json( {
				title: cached.title,
				revisionId: cached.revisionId,
				status: 'ready',
				chunkCount: cached.chunks.length
			} );
		}

		const state = getProcessingState( title );
		if ( state.state === 'error' ) {
			return res.json( { title, status: 'error', error: state.error } );
		}
		if ( state.state === 'processing' ) {
			return res.json( { title, status: 'processing' } );
		}

		return res.json( { title, status: 'unknown' } );
	} catch ( error ) {
		console.error( 'Status check error:', error );
		res.status( 500 ).json( { error: 'Failed to check status' } );
	}
} );

/**
 * POST /api/article/:title/query
 * Submit a natural language question.
 *
 * Body: `{ question: string, topK?: number }`. `topK` defaults to 3 and is
 * clamped to an integer between 1 and 10.
 *
 * Responds with 400 when the question is missing or not a non-blank string,
 * 503 while the article is still processing, 404 when the article has not
 * been processed, and 500 on unexpected failure.
 */
router.post( '/:title/query', async ( req, res ) => {
	try {
		const title = resolveTitle( req.params.title );
		// req.body is undefined when no body parser handled the request.
		const { question, topK } = req.body || {};

		if ( typeof question !== 'string' || question.trim() === '' ) {
			return res.status( 400 ).json( { error: 'Missing question' } );
		}

		const cached = await getCached( title );
		if ( !cached ) {
			const state = getProcessingState( title );
			if ( state.state === 'processing' ) {
				return res.status( 503 ).json( { error: 'Article still processing' } );
			}
			return res.status( 404 ).json( { error: 'Article not found or not processed' } );
		}

		// Embed the question
		const queryEmbedding = await embedSingle( question );

		// Search for relevant chunks
		const { results, belowThreshold } = search( queryEmbedding, cached.chunks, normalizeTopK( topK ) );

		res.json( {
			question,
			articleTitle: cached.title,
			results,
			belowThreshold
		} );
	} catch ( error ) {
		console.error( 'Query error:', error );
		res.status( 500 ).json( { error: 'Query failed' } );
	}
} );

/**
 * Background processing function.
 *
 * Fetches the article HTML, chunks it, embeds every chunk, optionally
 * generates suggested questions, stores the result in the cache and marks
 * the title as 'ready'. An article that yields no chunks is still cached
 * (with an empty chunk list) so it is not reprocessed on every request.
 *
 * @param {string} title - Article title; also used as the cache and processing-state key.
 * @param {*} revisionId - Revision the fetched content is recorded against.
 * @returns {Promise<void>}
 * @throws {Error} When the article HTML cannot be fetched, or when chunking,
 *   embedding or caching fails. The caller records the failure.
 */
async function processArticle( title, revisionId ) {
	console.log( `Processing article: ${ title }` );

	// Fetch parsed HTML from Action API
	const articleData = await getArticleHtml( title );
	if ( !articleData ) {
		throw new Error( 'Failed to fetch article HTML' );
	}

	const { html, sections } = articleData;

	// Chunk the article
	const chunks = chunkArticle( html, sections );
	console.log( `Created ${ chunks.length } chunks for ${ title }` );

	if ( chunks.length === 0 ) {
		// Still cache it, but with no chunks
		await setCache( title, buildCacheEntry( title, articleData, revisionId, [], [] ) );
		setProcessing( title, 'ready' );
		return;
	}

	// Generate embeddings for all chunks
	const texts = chunks.map( ( c ) => c.text );
	console.log( `Generating embeddings for ${ texts.length } chunks...` );
	const embeddings = await embedTexts( texts );

	// Attach embeddings to chunks
	chunks.forEach( ( chunk, i ) => {
		chunk.embedding = embeddings[ i ];
	} );

	// Generate suggested questions using Claude
	const suggestedQuestions = await generateSuggestedQuestions( chunks, articleData.title );

	// Save to cache
	await setCache( title, buildCacheEntry( title, articleData, revisionId, chunks, suggestedQuestions ) );

	setProcessing( title, 'ready' );
	console.log( `Finished processing: ${ title }` );
}

export default router;