Eric Gardner
Use Claude API for question generation
ce30646
import { Router } from 'express';
import { getArticleMetadata, getArticleHtml } from '../services/wikipedia.js';
import { chunkArticle } from '../services/chunker.js';
import { embedTexts, embedSingle } from '../services/embedder.js';
import { search } from '../services/vectorSearch.js';
import { getCached, setCache, isCacheValid } from '../services/cache.js';
import { getProcessingState, setProcessing } from '../services/processingState.js';
import { generateQuestionsWithClaude, isClaudeAvailable } from '../services/claudeQuestionGenerator.js';
// Router carrying the article routes registered below; it is this module's default export.
const router = Router();
/**
 * GET /api/article/:title
 *
 * Fetch article content; initiates the embedding pipeline if the article is
 * not cached for its current Wikipedia revision.
 *
 * Responses:
 * - 200 `{ title, revisionId, html, status: 'ready', chunkCount, suggestedQuestions }`
 *   when a cache entry valid for the current revision exists.
 * - 200 `{ title, revisionId, status: 'processing' }` when processing is
 *   already running or has just been started by this request.
 * - 404 `{ error }` when no metadata is returned for the title.
 * - 500 `{ error }` on any unexpected failure.
 */
router.get( '/:title', async ( req, res ) => {
	try {
		// Express has already percent-decoded route params. The extra decode is
		// kept for callers that double-encode, but it throws URIError for titles
		// containing a literal "%" (e.g. "100%"), so fall back to the raw value.
		let title = req.params.title;
		try {
			title = decodeURIComponent( title );
		} catch ( decodeError ) {
			// Not a percent-encoded string; keep the value as Express decoded it.
		}

		// Get current revision from Wikipedia
		const metadata = await getArticleMetadata( title );
		if ( !metadata ) {
			return res.status( 404 ).json( { error: 'Article not found' } );
		}

		// Serve from cache when it matches the current revision
		if ( await isCacheValid( title, metadata.revisionId ) ) {
			const cached = await getCached( title );
			// getCached is treated as nullable by the other handlers in this
			// file; if the entry is gone, fall through and rebuild it rather
			// than dereferencing a missing record.
			if ( cached ) {
				return res.json( {
					title: cached.title,
					revisionId: cached.revisionId,
					html: cached.html,
					status: 'ready',
					chunkCount: cached.chunks.length,
					suggestedQuestions: cached.suggestedQuestions || []
				} );
			}
		}

		const processingPayload = {
			title: metadata.title,
			revisionId: metadata.revisionId,
			status: 'processing'
		};

		// A run is already in flight: report it without starting another one
		const state = getProcessingState( title );
		if ( state.state === 'processing' ) {
			return res.json( processingPayload );
		}

		// Mark as processing before responding so concurrent requests see it
		setProcessing( title, 'processing' );

		// Return immediately with processing status
		res.json( processingPayload );

		// Process in background; the response has already been sent, so failures
		// are recorded in the processing state for the status endpoint to report.
		processArticle( title, metadata.revisionId ).catch( ( err ) => {
			console.error( `Error processing ${ title }:`, err );
			setProcessing( title, 'error', err.message );
		} );
	} catch ( error ) {
		console.error( 'Article fetch error:', error );
		res.status( 500 ).json( { error: 'Failed to fetch article' } );
	}
} );
/**
 * GET /api/article/:title/status
 *
 * Poll endpoint for embedding status.
 *
 * Responses (all 200 unless noted):
 * - `{ title, revisionId, status: 'ready', chunkCount }` when a cache entry exists.
 * - `{ title, status: 'error', error }` when the last processing run failed.
 * - `{ title, status: 'processing' }` while a run is in progress.
 * - `{ title, status: 'unknown' }` when nothing is known about the title.
 * - 500 `{ error }` on any unexpected failure.
 */
router.get( '/:title/status', async ( req, res ) => {
	try {
		// Express has already percent-decoded route params. The extra decode is
		// kept for callers that double-encode, but it throws URIError for titles
		// containing a literal "%" (e.g. "100%"), so fall back to the raw value.
		let title = req.params.title;
		try {
			title = decodeURIComponent( title );
		} catch ( decodeError ) {
			// Not a percent-encoded string; keep the value as Express decoded it.
		}

		// NOTE(review): the cache is consulted before the processing state, so if
		// getCached can return an entry for an older revision, this reports
		// 'ready' while a newer revision is still being processed — confirm
		// that is intended.
		const cached = await getCached( title );
		if ( cached ) {
			return res.json( {
				title: cached.title,
				revisionId: cached.revisionId,
				status: 'ready',
				chunkCount: cached.chunks.length
			} );
		}

		const state = getProcessingState( title );

		if ( state.state === 'error' ) {
			return res.json( {
				title,
				status: 'error',
				error: state.error
			} );
		}

		if ( state.state === 'processing' ) {
			return res.json( {
				title,
				status: 'processing'
			} );
		}

		return res.json( {
			title,
			status: 'unknown'
		} );
	} catch ( error ) {
		console.error( 'Status check error:', error );
		res.status( 500 ).json( { error: 'Failed to check status' } );
	}
} );
/**
 * POST /api/article/:title/query
 *
 * Submit a natural language question against a processed article.
 *
 * Request body:
 * - `question` (string, required): non-empty question text.
 * - `topK` (number, optional): how many results to return. Floored to an
 *   integer and capped at 10; missing or invalid values (non-numeric, < 1)
 *   fall back to the default of 3.
 *
 * Responses:
 * - 200 `{ question, articleTitle, results, belowThreshold }`
 * - 400 `{ error }` when `question` is missing, empty, or not a string.
 * - 503 `{ error }` when the article is still being processed.
 * - 404 `{ error }` when the article is neither cached nor processing.
 * - 500 `{ error }` on any unexpected failure.
 */
router.post( '/:title/query', async ( req, res ) => {
	const DEFAULT_TOP_K = 3;
	const MAX_TOP_K = 10;

	try {
		// Express has already percent-decoded route params. The extra decode is
		// kept for callers that double-encode, but it throws URIError for titles
		// containing a literal "%" (e.g. "100%"), so fall back to the raw value.
		let title = req.params.title;
		try {
			title = decodeURIComponent( title );
		} catch ( decodeError ) {
			// Not a percent-encoded string; keep the value as Express decoded it.
		}

		// req.body is undefined when no body parser handled the request; treat
		// that as an empty body so the caller gets a 400 instead of a 500.
		const body = req.body || {};
		const { question } = body;

		// Only a non-blank string can be embedded; anything else is a client error.
		if ( typeof question !== 'string' || question.trim() === '' ) {
			return res.status( 400 ).json( { error: 'Missing question' } );
		}

		// topK is client-supplied: coerce to a whole number in [1, MAX_TOP_K] so
		// NaN, zero, negative, or fractional values never reach the search.
		const requestedTopK = Math.floor( Number( body.topK ) );
		const topK = Number.isFinite( requestedTopK ) && requestedTopK >= 1 ?
			Math.min( requestedTopK, MAX_TOP_K ) :
			DEFAULT_TOP_K;

		const cached = await getCached( title );
		if ( !cached ) {
			const state = getProcessingState( title );
			if ( state.state === 'processing' ) {
				return res.status( 503 ).json( { error: 'Article still processing' } );
			}
			return res.status( 404 ).json( { error: 'Article not found or not processed' } );
		}

		// Embed the question
		const queryEmbedding = await embedSingle( question );

		// Search for relevant chunks
		const { results, belowThreshold } = search( queryEmbedding, cached.chunks, topK );

		res.json( {
			question,
			articleTitle: cached.title,
			results,
			belowThreshold
		} );
	} catch ( error ) {
		console.error( 'Query error:', error );
		res.status( 500 ).json( { error: 'Query failed' } );
	}
} );
/** Number of questions requested from Claude and the maximum stored per article. */
const SUGGESTED_QUESTION_COUNT = 5;

/** A generated question is kept only if its best search hit scores above this. */
const MIN_QUESTION_MATCH_SCORE = 0.3;

/**
 * Build the record written to the cache for a processed article, so every
 * code path stores the same shape.
 *
 * @param {Object} fields
 * @param {string} fields.title Title the article was requested under (cache key)
 * @param {*} fields.revisionId Revision the record was built from
 * @param {string} fields.articleTitle Title reported by the article fetch
 * @param {string} fields.html Parsed article HTML
 * @param {Array} fields.chunks Chunks, with embeddings attached
 * @param {string[]} fields.suggestedQuestions Validated suggested questions
 * @return {Object} Cache record
 */
function buildCacheEntry( { title, revisionId, articleTitle, html, chunks, suggestedQuestions } ) {
	return {
		title: articleTitle,
		normalizedTitle: title.toLowerCase().replace( / /g, '-' ),
		revisionId,
		fetchedAt: new Date().toISOString(),
		html,
		chunkCount: chunks.length,
		chunks,
		suggestedQuestions
	};
}

/**
 * Keep only the generated questions whose best-matching chunk scores above
 * MIN_QUESTION_MATCH_SCORE, capped at SUGGESTED_QUESTION_COUNT.
 *
 * @param {string[]} rawQuestions Questions as returned by the generator
 * @param {Array} chunks Chunks with embeddings attached
 * @return {Promise<string[]>} Validated questions, in their original order
 */
async function validateSuggestedQuestions( rawQuestions, chunks ) {
	const validated = [];

	for ( const question of rawQuestions ) {
		const questionEmbedding = await embedSingle( question );
		const { results } = search( questionEmbedding, chunks, 1 );

		if ( results.length === 0 ) {
			console.log( `Question: "${ question }" -> no results` );
			continue;
		}

		const score = results[ 0 ].score;
		console.log( `Question: "${ question }" -> score: ${ score.toFixed( 3 ) }` );

		if ( score > MIN_QUESTION_MATCH_SCORE ) {
			validated.push( question );
		}
	}

	return validated.slice( 0, SUGGESTED_QUESTION_COUNT );
}

/**
 * Generate and validate suggested questions with Claude. This is best-effort:
 * when Claude is unavailable or any step fails, an empty list is returned so
 * the article is still cached.
 *
 * @param {Array} chunks Chunks with embeddings attached
 * @param {string} articleTitle Title passed to the question generator
 * @return {Promise<string[]>} Validated questions (possibly empty)
 */
async function generateSuggestedQuestions( chunks, articleTitle ) {
	if ( !isClaudeAvailable() ) {
		console.log( 'ANTHROPIC_API_KEY not set, skipping question generation' );
		return [];
	}

	try {
		console.log( 'Generating questions with Claude...' );
		const rawQuestions = await generateQuestionsWithClaude(
			chunks,
			articleTitle,
			SUGGESTED_QUESTION_COUNT
		);
		console.log( 'Claude generated questions:', rawQuestions );

		const questions = await validateSuggestedQuestions( rawQuestions, chunks );
		console.log( `Generated ${ questions.length } validated questions` );
		return questions;
	} catch ( err ) {
		console.warn( 'Question generation failed, continuing without suggestions:', err.message );
		return [];
	}
}

/**
 * Background processing function: fetch the article HTML, chunk it, embed the
 * chunks, generate suggested questions, write the result to the cache, and
 * mark the title as 'ready'.
 *
 * Errors are not handled here; the caller is expected to catch the rejection
 * and record the failure.
 *
 * @param {string} title Article title; used as the cache and processing-state key
 * @param {*} revisionId Revision id stored with the cache record
 * @return {Promise<void>}
 * @throws {Error} When the article HTML cannot be fetched, or when the
 *   embedder returns no embedding for one of the chunks
 */
async function processArticle( title, revisionId ) {
	console.log( `Processing article: ${ title }` );

	// Fetch parsed HTML from Action API
	const articleData = await getArticleHtml( title );
	if ( !articleData ) {
		throw new Error( 'Failed to fetch article HTML' );
	}
	const { html, sections } = articleData;

	// Chunk the article
	const chunks = chunkArticle( html, sections );
	console.log( `Created ${ chunks.length } chunks for ${ title }` );

	let suggestedQuestions = [];

	// An article with no chunks is still cached; there is nothing to embed or
	// to generate questions from.
	if ( chunks.length > 0 ) {
		// Generate embeddings for all chunks
		const texts = chunks.map( ( c ) => c.text );
		console.log( `Generating embeddings for ${ texts.length } chunks...` );
		const embeddings = await embedTexts( texts );

		// Attach embeddings to chunks. Refuse to cache a chunk without one: the
		// record would otherwise look 'ready' while holding unusable chunks.
		chunks.forEach( ( chunk, i ) => {
			const embedding = embeddings[ i ];
			if ( embedding === undefined || embedding === null ) {
				throw new Error( `Missing embedding for chunk ${ i } of ${ title }` );
			}
			chunk.embedding = embedding;
		} );

		suggestedQuestions = await generateSuggestedQuestions( chunks, articleData.title );
	}

	// Save to cache
	await setCache( title, buildCacheEntry( {
		title,
		revisionId,
		articleTitle: articleData.title,
		html,
		chunks,
		suggestedQuestions
	} ) );

	setProcessing( title, 'ready' );
	console.log( `Finished processing: ${ title }` );
}
// Default export: the router with the article routes registered above.
export default router;