Spaces:
Sleeping
Sleeping
| import { Router } from 'express'; | |
| import { getArticleMetadata, getArticleHtml } from '../services/wikipedia.js'; | |
| import { chunkArticle } from '../services/chunker.js'; | |
| import { embedTexts, embedSingle } from '../services/embedder.js'; | |
| import { search } from '../services/vectorSearch.js'; | |
| import { getCached, setCache, isCacheValid } from '../services/cache.js'; | |
| import { getProcessingState, setProcessing } from '../services/processingState.js'; | |
| import { generateQuestionsWithClaude, isClaudeAvailable } from '../services/claudeQuestionGenerator.js'; | |
| const router = Router(); | |
/**
 * GET /api/article/:title
 *
 * Serves the cached article when the cache matches the revision reported by
 * getArticleMetadata; otherwise answers with status "processing" and runs
 * processArticle in the background.
 *
 * Responses:
 * - 200 { title, revisionId, html, status: 'ready', chunkCount, suggestedQuestions }
 * - 200 { title, revisionId, status: 'processing' }
 * - 404 { error } when no metadata is returned for the title
 * - 500 { error } on any unexpected failure
 */
router.get( '/:title', async ( req, res ) => {
	try {
		// Express has already percent-decoded route params. The extra decode is
		// kept only for clients that double-encode; it must not turn a title
		// containing a literal "%" (e.g. "100%") into a URIError / HTTP 500.
		let title = req.params.title;
		try {
			title = decodeURIComponent( title );
		} catch ( decodeError ) {
			// Not a valid escape sequence: the title is already decoded, keep it.
		}

		// Get current revision from Wikipedia
		const metadata = await getArticleMetadata( title );
		if ( !metadata ) {
			return res.status( 404 ).json( { error: 'Article not found' } );
		}

		// Check cache validity
		const cacheValid = await isCacheValid( title, metadata.revisionId );
		if ( cacheValid ) {
			const cached = await getCached( title );
			// The entry can disappear between the validity check and the read;
			// in that case fall through and rebuild it instead of crashing.
			if ( cached ) {
				return res.json( {
					title: cached.title,
					revisionId: cached.revisionId,
					html: cached.html,
					status: 'ready',
					chunkCount: cached.chunks.length,
					suggestedQuestions: cached.suggestedQuestions || []
				} );
			}
		}

		const processingResponse = {
			title: metadata.title,
			revisionId: metadata.revisionId,
			status: 'processing'
		};

		// Check if already processing
		const state = getProcessingState( title );
		if ( state.state === 'processing' ) {
			return res.json( processingResponse );
		}

		// Mark as processing before replying so concurrent requests do not
		// start a second pipeline for the same title.
		setProcessing( title, 'processing' );

		// Return immediately with processing status
		res.json( processingResponse );

		// Process in background; failures are recorded in the processing state
		// because the HTTP response has already been sent.
		processArticle( title, metadata.revisionId ).catch( ( err ) => {
			console.error( `Error processing ${ title }:`, err );
			setProcessing( title, 'error', err instanceof Error ? err.message : String( err ) );
		} );
	} catch ( error ) {
		console.error( 'Article fetch error:', error );
		res.status( 500 ).json( { error: 'Failed to fetch article' } );
	}
} );
/**
 * GET /api/article/:title/status
 *
 * Poll endpoint for the embedding pipeline.
 *
 * Responses (all 200 unless noted):
 * - { title, status: 'processing' } while a pipeline run is in progress
 * - { title, revisionId, status: 'ready', chunkCount } when a cache entry exists
 * - { title, status: 'error', error } when the last run failed and nothing is cached
 * - { title, status: 'unknown' } when nothing is known about the title
 * - 500 { error } on any unexpected failure
 */
router.get( '/:title/status', async ( req, res ) => {
	try {
		// Express has already percent-decoded route params. The extra decode is
		// kept only for clients that double-encode; it must not turn a title
		// containing a literal "%" (e.g. "100%") into a URIError / HTTP 500.
		let title = req.params.title;
		try {
			title = decodeURIComponent( title );
		} catch ( decodeError ) {
			// Not a valid escape sequence: the title is already decoded, keep it.
		}

		const state = getProcessingState( title );

		// An in-flight run wins over the cache: when a newer revision is being
		// processed, the cache still holds the previous revision and must not
		// be reported as "ready".
		if ( state.state === 'processing' ) {
			return res.json( {
				title,
				status: 'processing'
			} );
		}

		const cached = await getCached( title );
		if ( cached ) {
			return res.json( {
				title: cached.title,
				revisionId: cached.revisionId,
				status: 'ready',
				chunkCount: cached.chunks.length
			} );
		}

		if ( state.state === 'error' ) {
			return res.json( {
				title,
				status: 'error',
				error: state.error
			} );
		}

		return res.json( {
			title,
			status: 'unknown'
		} );
	} catch ( error ) {
		console.error( 'Status check error:', error );
		res.status( 500 ).json( { error: 'Failed to check status' } );
	}
} );
/**
 * POST /api/article/:title/query
 *
 * Answers a natural language question against the cached chunks of an article.
 *
 * Request body: { question: string, topK?: number }
 * - topK defaults to 3, is capped at 10, and falls back to 3 when it is not
 *   a positive integer.
 *
 * Responses:
 * - 200 { question, articleTitle, results, belowThreshold }
 * - 400 { error } when the question is missing, blank or not a string
 * - 503 { error } when the article is still being processed
 * - 404 { error } when the article is neither cached nor processing
 * - 500 { error } on any unexpected failure
 */
router.post( '/:title/query', async ( req, res ) => {
	try {
		// Express has already percent-decoded route params. The extra decode is
		// kept only for clients that double-encode; it must not turn a title
		// containing a literal "%" (e.g. "100%") into a URIError / HTTP 500.
		let title = req.params.title;
		try {
			title = decodeURIComponent( title );
		} catch ( decodeError ) {
			// Not a valid escape sequence: the title is already decoded, keep it.
		}

		// req.body is undefined when no body parser ran or no body was sent.
		const { question, topK = 3 } = req.body || {};
		if ( typeof question !== 'string' || question.trim() === '' ) {
			return res.status( 400 ).json( { error: 'Missing question' } );
		}

		// Client-supplied value: accept numbers and numeric strings, reject
		// NaN / zero / negatives by falling back to the default.
		const requestedTopK = Number.parseInt( topK, 10 );
		const limit = Number.isFinite( requestedTopK ) && requestedTopK > 0 ?
			Math.min( requestedTopK, 10 ) :
			3;

		const cached = await getCached( title );
		if ( !cached ) {
			const state = getProcessingState( title );
			if ( state.state === 'processing' ) {
				return res.status( 503 ).json( { error: 'Article still processing' } );
			}
			return res.status( 404 ).json( { error: 'Article not found or not processed' } );
		}

		// Embed the question
		const queryEmbedding = await embedSingle( question );

		// Search for relevant chunks
		const { results, belowThreshold } = search(
			queryEmbedding,
			cached.chunks,
			limit
		);

		res.json( {
			question,
			articleTitle: cached.title,
			results,
			belowThreshold
		} );
	} catch ( error ) {
		console.error( 'Query error:', error );
		res.status( 500 ).json( { error: 'Query failed' } );
	}
} );
/**
 * Assemble the record stored in the article cache.
 *
 * Used for both the "no chunks" and the normal path so every cached record
 * has the same set of fields.
 *
 * @param {string} title Title the article was requested under (cache key)
 * @param {*} revisionId Revision identifier passed in by the caller
 * @param {Object} articleData Result of getArticleHtml (reads .title and .html)
 * @param {Array} chunks Chunks to persist (may be empty)
 * @param {string[]} suggestedQuestions Validated suggested questions
 * @return {Object} Cache record
 */
function buildCacheEntry( title, revisionId, articleData, chunks, suggestedQuestions ) {
	return {
		title: articleData.title,
		normalizedTitle: title.toLowerCase().replace( / /g, '-' ),
		revisionId,
		fetchedAt: new Date().toISOString(),
		html: articleData.html,
		chunkCount: chunks.length,
		chunks,
		suggestedQuestions
	};
}

/**
 * Ask Claude for candidate questions and keep only those whose best-matching
 * chunk scores above minScore.
 *
 * Best effort: any failure is logged and yields an empty list so the article
 * can still be cached without suggestions.
 *
 * @param {Array} chunks Chunks with their embeddings attached
 * @param {string} articleTitle Title passed to the question generator
 * @param {number} [maxQuestions=5] Number of questions requested and kept
 * @param {number} [minScore=0.3] Exclusive lower bound on the top search score
 * @return {Promise<string[]>} Validated questions (possibly empty)
 */
async function generateSuggestedQuestions( chunks, articleTitle, maxQuestions = 5, minScore = 0.3 ) {
	if ( !isClaudeAvailable() ) {
		console.log( 'ANTHROPIC_API_KEY not set, skipping question generation' );
		return [];
	}

	try {
		console.log( 'Generating questions with Claude...' );
		const rawQuestions = await generateQuestionsWithClaude( chunks, articleTitle, maxQuestions );
		console.log( 'Claude generated questions:', rawQuestions );

		// Validate questions by checking that they match article content.
		const validated = [];
		for ( const question of rawQuestions ) {
			// Enough questions collected; skip embedding the remainder.
			if ( validated.length >= maxQuestions ) {
				break;
			}

			const questionEmbedding = await embedSingle( question );
			const { results } = search( questionEmbedding, chunks, 1 );
			if ( results.length === 0 ) {
				console.log( `Question: "${ question }" -> no results` );
				continue;
			}

			const score = results[ 0 ].score;
			console.log( `Question: "${ question }" -> score: ${ score.toFixed( 3 ) }` );
			if ( score > minScore ) {
				validated.push( question );
			}
		}

		console.log( `Generated ${ validated.length } validated questions` );
		return validated;
	} catch ( err ) {
		console.warn( 'Question generation failed, continuing without suggestions:', err.message );
		return [];
	}
}

/**
 * Background processing function.
 *
 * Fetches the article HTML, chunks it, embeds every chunk, optionally
 * generates suggested questions, stores the result in the cache and marks the
 * title as 'ready'. Errors are thrown to the caller, which is responsible for
 * recording the 'error' state.
 *
 * @param {string} title Article title (used as cache and state key)
 * @param {*} revisionId Revision identifier stored alongside the cached data
 * @return {Promise<void>}
 * @throws {Error} When the HTML cannot be fetched or the embedder returns a
 *  different number of embeddings than chunks
 */
async function processArticle( title, revisionId ) {
	console.log( `Processing article: ${ title }` );

	const articleData = await getArticleHtml( title );
	if ( !articleData ) {
		throw new Error( 'Failed to fetch article HTML' );
	}

	// Chunk the article
	const chunks = chunkArticle( articleData.html, articleData.sections );
	console.log( `Created ${ chunks.length } chunks for ${ title }` );

	// An article without chunks is still cached, just without embeddings or
	// suggested questions.
	let suggestedQuestions = [];
	if ( chunks.length > 0 ) {
		// Generate embeddings for all chunks
		const texts = chunks.map( ( c ) => c.text );
		console.log( `Generating embeddings for ${ texts.length } chunks...` );
		const embeddings = await embedTexts( texts );

		// Refuse to cache chunks with missing embeddings; they would break
		// every later search against this article.
		if ( !embeddings || embeddings.length !== chunks.length ) {
			const received = embeddings ? embeddings.length : 0;
			throw new Error(
				`Embedding count mismatch: expected ${ chunks.length }, got ${ received }`
			);
		}

		// Attach embeddings to chunks
		chunks.forEach( ( chunk, i ) => {
			chunk.embedding = embeddings[ i ];
		} );

		suggestedQuestions = await generateSuggestedQuestions( chunks, articleData.title );
	}

	// Save to cache
	await setCache(
		title,
		buildCacheEntry( title, revisionId, articleData, chunks, suggestedQuestions )
	);
	setProcessing( title, 'ready' );
	console.log( `Finished processing: ${ title }` );
}
| export default router; | |