Eric Gardner committed on
Commit
be647a4
·
1 Parent(s): 7ecba03

Use Flan-T5-base to pre-generate questions

Browse files
Files changed (3) hide show
  1. index.js +8 -4
  2. routes/article.js +42 -2
  3. services/questionGenerator.js +121 -0
index.js CHANGED
@@ -3,6 +3,7 @@ import cors from 'cors';
3
  import articleRoutes from './routes/article.js';
4
  import searchRoutes from './routes/search.js';
5
  import { initEmbedder } from './services/embedder.js';
 
6
 
7
  const app = express();
8
  const PORT = process.env.PORT || 3000;
@@ -22,13 +23,16 @@ app.get( '/api/health', ( _, res ) => {
22
  res.json( { status: 'ok' } );
23
  } );
24
 
25
- // Pre-warm the embedding model on startup
26
- console.log( 'Starting server and loading embedding model...' );
27
- initEmbedder().then( () => {
 
 
 
28
  app.listen( PORT, () => {
29
  console.log( `Server running on http://localhost:${ PORT }` );
30
  } );
31
  } ).catch( ( err ) => {
32
- console.error( 'Failed to initialize embedder:', err );
33
  process.exit( 1 );
34
  } );
 
3
  import articleRoutes from './routes/article.js';
4
  import searchRoutes from './routes/search.js';
5
  import { initEmbedder } from './services/embedder.js';
6
+ import { initQuestionGenerator } from './services/questionGenerator.js';
7
 
8
  const app = express();
9
  const PORT = process.env.PORT || 3000;
 
23
  res.json( { status: 'ok' } );
24
  } );
25
 
26
+ // Pre-warm the models on startup
27
+ console.log( 'Starting server and loading models...' );
28
+ Promise.all( [
29
+ initEmbedder(),
30
+ initQuestionGenerator()
31
+ ] ).then( () => {
32
  app.listen( PORT, () => {
33
  console.log( `Server running on http://localhost:${ PORT }` );
34
  } );
35
  } ).catch( ( err ) => {
36
+ console.error( 'Failed to initialize models:', err );
37
  process.exit( 1 );
38
  } );
routes/article.js CHANGED
@@ -5,6 +5,7 @@ import { embedTexts, embedSingle } from '../services/embedder.js';
5
  import { search } from '../services/vectorSearch.js';
6
  import { getCached, setCache, isCacheValid } from '../services/cache.js';
7
  import { getProcessingState, setProcessing } from '../services/processingState.js';
 
8
 
9
  const router = Router();
10
 
@@ -32,7 +33,8 @@ router.get( '/:title', async ( req, res ) => {
32
  revisionId: cached.revisionId,
33
  html: cached.html,
34
  status: 'ready',
35
- chunkCount: cached.chunks.length
 
36
  } );
37
  }
38
 
@@ -202,6 +204,43 @@ async function processArticle( title, revisionId ) {
202
  chunk.embedding = embeddings[ i ];
203
  } );
204
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  // Save to cache
206
  await setCache( title, {
207
  title: articleData.title,
@@ -210,7 +249,8 @@ async function processArticle( title, revisionId ) {
210
  fetchedAt: new Date().toISOString(),
211
  html,
212
  chunkCount: chunks.length,
213
- chunks
 
214
  } );
215
 
216
  setProcessing( title, 'ready' );
 
5
  import { search } from '../services/vectorSearch.js';
6
  import { getCached, setCache, isCacheValid } from '../services/cache.js';
7
  import { getProcessingState, setProcessing } from '../services/processingState.js';
8
+ import { generateQuestions, getLeadSectionText } from '../services/questionGenerator.js';
9
 
10
  const router = Router();
11
 
 
33
  revisionId: cached.revisionId,
34
  html: cached.html,
35
  status: 'ready',
36
+ chunkCount: cached.chunks.length,
37
+ suggestedQuestions: cached.suggestedQuestions || []
38
  } );
39
  }
40
 
 
204
  chunk.embedding = embeddings[ i ];
205
  } );
206
 
207
+ // Generate suggested questions from the lead section
208
+ let suggestedQuestions = [];
209
+ try {
210
+ const leadText = getLeadSectionText( chunks );
211
+ console.log( `Lead text length: ${ leadText.length } chars` );
212
+ if ( leadText.length > 100 ) {
213
+ console.log( 'Generating suggested questions...' );
214
+ const rawQuestions = await generateQuestions( leadText, 5 );
215
+ console.log( `Raw questions from model:`, rawQuestions );
216
+
217
+ // Validate questions by checking if they match article content
218
+ const validatedQuestions = [];
219
+ for ( const question of rawQuestions ) {
220
+ const questionEmbedding = await embedSingle( question );
221
+ const { results } = search( questionEmbedding, chunks, 1 );
222
+
223
+ if ( results.length === 0 ) {
224
+ console.log( `Question: "${ question }" -> no results` );
225
+ continue;
226
+ }
227
+
228
+ const score = results[ 0 ].score;
229
+ console.log( `Question: "${ question }" -> score: ${ score.toFixed( 3 ) }` );
230
+
231
+ // Keep questions that have a good match (score > 0.3)
232
+ if ( score > 0.3 ) {
233
+ validatedQuestions.push( question );
234
+ }
235
+ }
236
+
237
+ suggestedQuestions = validatedQuestions.slice( 0, 3 );
238
+ console.log( `Generated ${ suggestedQuestions.length } validated questions` );
239
+ }
240
+ } catch ( err ) {
241
+ console.warn( 'Question generation failed, continuing without suggestions:', err.message );
242
+ }
243
+
244
  // Save to cache
245
  await setCache( title, {
246
  title: articleData.title,
 
249
  fetchedAt: new Date().toISOString(),
250
  html,
251
  chunkCount: chunks.length,
252
+ chunks,
253
+ suggestedQuestions
254
  } );
255
 
256
  setProcessing( title, 'ready' );
services/questionGenerator.js ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { pipeline } from '@xenova/transformers';
2
+
3
+ let generator = null;
4
+
5
+ /**
6
+ * Initialize the question generation model (Flan-T5-base)
7
+ */
8
/**
 * Initialize the question generation model (Flan-T5-base).
 *
 * Loads the text2text-generation pipeline once and caches it in the
 * module-level `generator`; subsequent calls return the cached instance.
 *
 * @returns {Promise<Object>} The shared text2text-generation pipeline
 */
export async function initQuestionGenerator() {
	if ( generator !== null ) {
		return generator;
	}
	console.log( 'Loading question generation model (flan-t5-base)...' );
	generator = await pipeline( 'text2text-generation', 'Xenova/flan-t5-base' );
	console.log( 'Question generation model loaded.' );
	return generator;
}
16
+
17
+ /**
18
+ * Generate a single question from a text passage
19
+ *
20
+ * @param {string} text - The passage to generate a question about
21
+ * @returns {Promise<string|null>} - Generated question or null
22
+ */
23
/**
 * Generate a single question from a text passage.
 *
 * @param {string} text - The passage to generate a question about
 * @returns {Promise<string|null>} - Generated question, or null when the
 *   model output is too short to be useful
 */
async function generateSingleQuestion( text ) {
	// More specific prompt to encourage factual questions
	const prompt = `Ask a specific factual question that can be answered by the following passage: ${ text }`;

	const [ first ] = await generator( prompt, {
		max_new_tokens: 60,
		num_beams: 2,
		do_sample: false
	} );

	const candidate = first.generated_text.trim();

	// Discard trivially short outputs; otherwise normalize the trailing '?'.
	if ( candidate.length <= 10 ) {
		return null;
	}
	return candidate.endsWith( '?' ) ? candidate : `${ candidate }?`;
}
41
+
42
+ /**
43
+ * Group sentences into chunks of N for more context
44
+ *
45
+ * @param {string[]} sentences - Array of sentences
46
+ * @param {number} groupSize - Number of sentences per group
47
+ * @returns {string[]} - Array of grouped sentence strings
48
+ */
49
/**
 * Group sentences into chunks of N for more context.
 *
 * @param {string[]} sentences - Array of sentences
 * @param {number} groupSize - Number of sentences per group
 * @returns {string[]} - Array of grouped sentence strings
 */
function groupSentences( sentences, groupSize = 3 ) {
	const groups = [];
	let start = 0;
	while ( start < sentences.length ) {
		groups.push( sentences.slice( start, start + groupSize ).join( ' ' ) );
		start += groupSize;
	}
	return groups;
}
57
+
58
+ /**
59
+ * Generate questions from a text passage
60
+ *
61
+ * @param {string} text - The passage to generate questions about
62
+ * @param {number} numQuestions - Number of questions to generate (default: 5)
63
+ * @returns {Promise<string[]>} - Array of generated questions
64
+ */
65
/**
 * Generate questions from a text passage.
 *
 * Splits the passage into sentences, groups them into two-sentence
 * passages for context, and asks the model for one question per passage
 * until `numQuestions` unique questions are collected.
 *
 * @param {string} text - The passage to generate questions about
 * @param {number} numQuestions - Number of questions to generate (default: 5)
 * @returns {Promise<string[]>} - Array of generated questions (empty on failure)
 */
export async function generateQuestions( text, numQuestions = 5 ) {
	if ( !generator ) {
		await initQuestionGenerator();
	}

	// Split text into sentences, dropping fragments too short to question.
	const sentences = text
		.split( /(?<=[.!?])\s+/ )
		.filter( ( s ) => s.length > 30 );

	// Group sentences (2-3 at a time) for more context per question
	const passages = groupSentences( sentences, 2 );

	// Only run the model on the leading numQuestions * 2 passages.
	const candidates = passages.slice( 0, Math.min( numQuestions * 2, passages.length ) );

	const questions = [];
	const seen = new Set();

	try {
		for ( const passage of candidates ) {
			if ( questions.length >= numQuestions ) {
				break;
			}

			const question = await generateSingleQuestion( passage );
			if ( question === null ) {
				continue;
			}

			// De-duplicate case-insensitively.
			const key = question.toLowerCase();
			if ( !seen.has( key ) ) {
				seen.add( key );
				questions.push( question );
			}
		}

		return questions;
	} catch ( error ) {
		console.error( 'Question generation failed:', error );
		return [];
	}
}
104
+
105
+ /**
106
+ * Extract lead section text from chunks
107
+ *
108
+ * @param {Array} chunks - Article chunks with sectionTitle
109
+ * @returns {string} - Combined text from the introduction/lead section
110
+ */
111
/**
 * Extract lead section text from chunks.
 *
 * A chunk belongs to the lead when its sectionTitle is 'Introduction'
 * or its sectionId is null.
 *
 * @param {Array} chunks - Article chunks with sectionTitle
 * @returns {string} - Combined text from the introduction/lead section
 */
export function getLeadSectionText( chunks ) {
	const isLead = ( chunk ) =>
		chunk.sectionTitle === 'Introduction' || chunk.sectionId === null;

	// Take up to first 3 paragraphs from the lead
	const texts = [];
	for ( const chunk of chunks ) {
		if ( texts.length === 3 ) {
			break;
		}
		if ( isLead( chunk ) ) {
			texts.push( chunk.text );
		}
	}
	return texts.join( ' ' );
}