htaf
/

distill-pipeline

retrieval-augmented-generation

self-improvement

question-generation

reward-modeling

Model card Files Files and versions

distill-pipeline / src /question /question_core.mjs

htaf's picture

tightened up question limits

68e4117 3 months ago

history blame contribute delete

6.29 kB

	// src/question/question_core.mjs
	import { preview } from '../pipeline/util.mjs';

	/**
	 * Safely parse a JSON document held in a string.
	 *
	 * Only text whose trimmed form starts with `{` or `[` is handed to
	 * JSON.parse, so bare scalars such as `42` or `"text"` are rejected
	 * up front rather than being parsed.
	 *
	 * @param {unknown} raw - Candidate JSON text.
	 * @returns {any} The parsed object/array on success; null when `raw` is
	 *   empty, not a string, not object/array shaped, or not valid JSON.
	 */
	function tryParseJson(raw) {
	  if (!raw || typeof raw !== 'string') return null;

	  const trimmed = raw.trim();

	  // Quick sanity check: a JSON object or array must start with { or [.
	  const looksStructured = trimmed.startsWith('{') || trimmed.startsWith('[');
	  if (!looksStructured) return null;

	  try {
	    return JSON.parse(trimmed);
	  } catch {
	    // Malformed JSON is an expected outcome here; callers fall back to
	    // plain-text handling when they receive null.
	    return null;
	  }
	}

	/**
	 * Extract questions from a plain-text response.
	 *
	 * Designed to handle real LLM outputs such as one question per line:
	 *
	 *   What is the primary purpose of practicing presence according to the text?
	 *   How does Q'uo characterize the physical vehicle's limitations?
	 *
	 * as well as numbered/bulleted lists:
	 *
	 *   1. What is ... ?
	 *   2) What is ... ?
	 *   - How does ... ?
	 *   * Why is ... ?
	 *
	 * Processing steps:
	 *  1. Simple XML/HTML-like tags (e.g. <analysis>) are replaced by spaces.
	 *  2. Each non-empty line containing '?' has its list prefix removed and is
	 *     cut at the first '?'. Candidates shorter than 10 characters or without
	 *     any ASCII letter are dropped.
	 *  3. If no line yields a question, the whole text is split on '?' and each
	 *     chunk of at least 10 characters containing a letter is kept.
	 *  4. Exact duplicates are removed, preserving first-seen order.
	 *
	 * @param {unknown} rawText - Model output to scan.
	 * @returns {string[]} Unique questions, each ending in '?'. Empty when
	 *   `rawText` is empty or not a string.
	 */
	function extractQuestionsFromText(rawText) {
	  if (!rawText || typeof rawText !== 'string') return [];

	  const MIN_LENGTH = 10;
	  const hasLetter = (s) => /[a-zA-Z]/.test(s);

	  // Strip trivial XML/HTML-ish tags like <analysis>, <reasoning>, etc.
	  const stripped = rawText.replace(/<\/?[a-zA-Z0-9_:-]+>/g, ' ');

	  const lines = stripped
	    .split(/\r?\n/)
	    .map((l) => l.trim())
	    .filter(Boolean);

	  // Leading list markers: "1. ", "1) ", "- ", "* ".
	  const listPrefix = /^(?:\d+\s*[.)]\s*|[-*]\s*)/;

	  const questions = [];

	  for (const line of lines) {
	    // Must contain a question mark somewhere.
	    if (!line.includes('?')) continue;

	    const cleaned = line.replace(listPrefix, '').trim();

	    // Everything up to the first '?' is treated as the question; any
	    // trailing text on the same line is discarded.
	    const qPart = cleaned.split('?')[0].trim();
	    if (!qPart) continue;

	    const q = `${qPart}?`;

	    // Filter out tiny or degenerate candidates.
	    if (q.length < MIN_LENGTH) continue;
	    if (!hasLetter(q)) continue;

	    questions.push(q);
	  }

	  // Fallback when the line-based pass found nothing: split the whole text
	  // on '?' and recover sentence-like chunks. The final segment is skipped
	  // because it is not terminated by a question mark.
	  if (questions.length === 0) {
	    const segments = stripped.split('?');
	    for (const segment of segments.slice(0, -1)) {
	      const seg = segment.trim();
	      if (seg.length < MIN_LENGTH) continue;
	      if (!hasLetter(seg)) continue;
	      questions.push(`${seg}?`);
	    }
	  }

	  // A Set keeps insertion order, so this deduplicates while preserving order.
	  return [...new Set(questions)];
	}

	/**
	 * Core helper: turn a raw model response into a list of questions.
	 *
	 * Strategy:
	 *  - JSON is tried first. Two shapes are recognized:
	 *      { questions: [...] }  - only string entries are used;
	 *      [...]                 - entries may be strings or objects carrying
	 *                              a string `question` or `question_text`.
	 *    In both shapes only trimmed, non-empty entries ending in '?' are kept.
	 *  - If the text is not JSON, or the JSON has another shape, questions are
	 *    extracted from the plain text instead and `parsed` records the reason.
	 *
	 * @param {unknown} raw - Raw model output.
	 * @param {{ maxQuestions?: number }} [options] - When `maxQuestions` is
	 *   truthy, at most that many questions are returned; otherwise all are.
	 * @returns {{ questions: string[], raw: any, parsed: any }} `raw` echoes
	 *   the input (or '' when it is null/undefined). `parsed` is the parsed
	 *   JSON value when a recognized shape was found, otherwise an object
	 *   `{ error, rawSnippet? }` where `error` is one of 'empty_response',
	 *   'unrecognized_json_shape' or 'invalid_json'.
	 */
	export function parseQuestionResponse(raw, { maxQuestions } = {}) {
	  const result = {
	    questions: [],
	    raw: raw ?? '',
	    parsed: null,
	  };

	  if (!raw || typeof raw !== 'string') {
	    result.parsed = { error: 'empty_response' };
	    return result;
	  }

	  // Shared post-processing for every extraction path.
	  const limit = (qs) => (maxQuestions ? qs.slice(0, maxQuestions) : qs);
	  const isQuestion = (q) => Boolean(q) && q.endsWith('?');

	  // Pull the question text out of one array-root entry ('' when absent).
	  const textOfItem = (item) => {
	    if (typeof item === 'string') return item.trim();
	    if (item && typeof item === 'object') {
	      if (typeof item.question === 'string') return item.question.trim();
	      if (typeof item.question_text === 'string') {
	        return item.question_text.trim();
	      }
	    }
	    return '';
	  };

	  const parsed = tryParseJson(raw);

	  if (parsed != null) {
	    result.parsed = parsed;

	    // Case 1: { questions: [...] }
	    if (typeof parsed === 'object' && Array.isArray(parsed.questions)) {
	      const qs = parsed.questions
	        .map((q) => (typeof q === 'string' ? q.trim() : ''))
	        .filter(isQuestion);
	      result.questions = limit(qs);
	      return result;
	    }

	    // Case 2: array root
	    if (Array.isArray(parsed)) {
	      result.questions = limit(parsed.map(textOfItem).filter(isQuestion));
	      return result;
	    }

	    // Parsed JSON but not in a recognized shape.
	    result.parsed = {
	      error: 'unrecognized_json_shape',
	      rawSnippet: preview(raw, 200),
	    };
	  } else {
	    // Not valid JSON at all.
	    result.parsed = {
	      error: 'invalid_json',
	      rawSnippet: preview(raw, 200),
	    };
	  }

	  // Fallback: extract questions from plain text.
	  result.questions = limit(extractQuestionsFromText(raw));

	  return result;
	}

	/**
	 * High-level helper used by the pipeline:
	 *
	 *   const { questions, raw, parsed } =
	 *     await runQuestionGenerator(contextText, provider, { maxQuestions });
	 *
	 * Builds a question-generation prompt around `contextText`, sends it to
	 * `provider.generate(prompt)` and parses the reply with
	 * parseQuestionResponse.
	 *
	 * @param {string} contextText - Source text the questions must be about.
	 * @param {{ generate: Function }} provider - Object exposing
	 *   `generate(prompt)`; its awaited result is treated as the raw response.
	 * @param {{ maxQuestions?: number }} [options] - Upper bound on returned
	 *   questions (default 5), forwarded to parseQuestionResponse.
	 * @returns {Promise<{ questions: string[], raw: any, parsed: any }>}
	 *   The parse result, or an empty result with
	 *   `parsed.error === 'empty_context'` when `contextText` is missing,
	 *   not a string, or whitespace-only (the provider is not called).
	 * @throws {Error} When `provider` lacks a `generate` function.
	 */
	export async function runQuestionGenerator(
	  contextText,
	  provider,
	  { maxQuestions = 5 } = {},
	) {
	  if (!provider || typeof provider.generate !== 'function') {
	    throw new Error('Question provider must implement .generate(prompt)');
	  }

	  // The typeof check keeps a non-string context from crashing on .trim().
	  if (typeof contextText !== 'string' || !contextText.trim()) {
	    return { questions: [], raw: '', parsed: { error: 'empty_context' } };
	  }

	  // Minimal built-in prompt; if you have a richer prompt file, you can
	  // load it and inject {{CONTEXT}} before calling provider.generate.
	  const prompt = [
	    'You are a question generation assistant.',
	    '',
	    'You will be given a chunk of spiritual teaching text as CONTEXT.',
	    'Generate diverse, high-quality questions that:',
	    '- are answerable from the context only,',
	    '- require some thinking, not just copying a sentence,',
	    '- are phrased as clear, direct questions.',
	    '',
	    'Return either:',
	    '- JSON: { "questions": ["Q1?", "Q2?", ...] }',
	    ' or an array of question-like objects/strings; OR',
	    '- Plain text with one question per line.',
	    '',
	    '---',
	    'CONTEXT:',
	    contextText,
	    '---',
	  ].join('\n');

	  const raw = await provider.generate(prompt);

	  return parseQuestionResponse(raw, { maxQuestions });
	}