// distill-pipeline/src/question/question_core.mjs
// htaf's picture
// tightened up question limits
// 68e4117
// src/question/question_core.mjs
import { preview } from '../pipeline/util.mjs';
/**
 * Safely parse a JSON object or array out of a raw model response.
 *
 * Model output frequently arrives wrapped in a Markdown code fence
 * (```json ... ```), so a single enclosing fence and its optional language
 * tag are removed before parsing.
 *
 * @param {unknown} raw - Raw response text.
 * @returns {object | Array | null} The parsed object/array, or `null` when
 *   `raw` is empty, is not a string, does not start with `{` or `[` after
 *   trimming/unfencing, or is not valid JSON. This function never throws.
 */
function tryParseJson(raw) {
  if (!raw || typeof raw !== 'string') return null;

  let text = raw.trim();

  // Unwrap one surrounding code fence. The length guard ensures the opening
  // and closing ``` markers cannot overlap.
  if (text.length >= 6 && text.startsWith('```') && text.endsWith('```')) {
    // Drop the optional language tag ("json", "JSON", ...) right after the
    // opening fence, then any surrounding whitespace/newlines.
    text = text.slice(3, -3).replace(/^[a-zA-Z0-9_-]*/, '').trim();
  }

  // Quick sanity check: only JSON objects and arrays are of interest here,
  // so bare scalars such as `42` or `"text"` are rejected up front.
  if (!text.startsWith('{') && !text.startsWith('[')) {
    return null;
  }

  try {
    return JSON.parse(text);
  } catch {
    // Malformed JSON is an expected outcome for free-form model output.
    return null;
  }
}
/**
 * Pull question sentences out of a free-form (non-JSON) model response.
 *
 * Handles one-question-per-line output such as
 *
 *   What is the primary purpose of practicing presence according to the text?
 *   How does Q'uo characterize the physical vehicle's limitations?
 *
 * as well as numbered or bulleted lists:
 *
 *   1. What is ... ?
 *   - How does ... ?
 *   * Why is ... ?
 *
 * Pass 1 works line by line and keeps the text up to the first '?' on each
 * line. Pass 2 runs only when pass 1 finds nothing: it splits the whole text
 * on '?', which recovers questions that wrap across lines.
 *
 * @param {unknown} rawText - Model output.
 * @returns {string[]} Unique questions in order of first appearance; an empty
 *   array for empty or non-string input.
 */
function extractQuestionsFromText(rawText) {
  if (typeof rawText !== 'string' || rawText.length === 0) return [];

  const MIN_LENGTH = 10;
  const containsLetter = (text) => /[a-zA-Z]/.test(text);

  // Simple attribute-less tags (<analysis>, </reasoning>, ...) become a space.
  const withoutTags = rawText.replace(/<\/?[a-zA-Z0-9_:-]+>/g, ' ');

  // Pass 1: at most one candidate per line.
  let questions = withoutTags
    .split(/\r?\n/)
    .map((entry) => entry.trim())
    .filter((entry) => entry.includes('?'))
    .map((entry) => {
      // Drop list markers: "1. ", "1) ", "- ", "* ".
      const body = entry.replace(/^(?:\d+\s*[.)]\s*|[-*]\s*)/, '').trim();
      // The marker pattern can never consume '?', so indexOf always hits.
      return body.slice(0, body.indexOf('?')).trim();
    })
    .filter((stem) => stem.length > 0)
    .map((stem) => `${stem}?`)
    // Discard tiny or letter-free fragments (length includes the '?').
    .filter((candidate) => candidate.length >= MIN_LENGTH && containsLetter(candidate));

  // Pass 2 (fallback): split the whole text on '?'. The chunk after the last
  // '?' is not a question, hence slice(0, -1). The length check here applies
  // to the chunk before '?' is appended.
  if (questions.length === 0) {
    questions = withoutTags
      .split('?')
      .slice(0, -1)
      .map((chunk) => chunk.trim())
      .filter((chunk) => chunk.length >= MIN_LENGTH)
      .map((chunk) => `${chunk}?`)
      .filter((candidate) => containsLetter(candidate));
  }

  // A Set iterates in insertion order, so the first occurrence wins.
  return [...new Set(questions)];
}
/**
 * Pull a trimmed question string out of one JSON entry.
 * Accepts a bare string, or an object carrying a string `question` or
 * `question_text` field (checked in that order); anything else yields ''.
 */
function questionTextOf(item) {
  if (typeof item === 'string') return item.trim();
  if (item && typeof item === 'object') {
    if (typeof item.question === 'string') return item.question.trim();
    if (typeof item.question_text === 'string') return item.question_text.trim();
  }
  return '';
}

/**
 * Cap `list` at `maxQuestions` entries. A missing, zero, negative or
 * non-numeric limit means "no limit"; in particular a negative value no
 * longer reaches `slice`, where it would trim entries from the end.
 */
function limitQuestions(list, maxQuestions) {
  const limit = Number(maxQuestions);
  return limit > 0 ? list.slice(0, limit) : list;
}

/**
 * Core helper: turn a raw model response into a list of questions.
 *
 * Strategy:
 * - JSON first (via tryParseJson): either `{ "questions": [...] }` or an
 *   array root. In both shapes each entry may be a string or an object with
 *   a `question` / `question_text` string; only entries ending in '?' are
 *   kept. A recognized JSON shape never falls through to text extraction,
 *   even if it yields no questions.
 * - Otherwise the raw text is handed to extractQuestionsFromText.
 *
 * @param {unknown} raw - Raw model output.
 * @param {{ maxQuestions?: number }} [options] - `maxQuestions` caps the
 *   number of returned questions; omitted, 0 or negative means no cap.
 * @returns {{ questions: string[], raw: unknown, parsed: any }}
 *   `raw` echoes the input (or '' when it is null/undefined). `parsed` is the
 *   parsed JSON on success, otherwise `{ error, rawSnippet? }` where `error`
 *   is 'empty_response', 'invalid_json' or 'unrecognized_json_shape' and
 *   `rawSnippet` is the result of `preview(raw, 200)`.
 */
export function parseQuestionResponse(raw, { maxQuestions } = {}) {
  const result = {
    questions: [],
    raw: raw ?? '',
    parsed: null,
  };

  if (!raw || typeof raw !== 'string') {
    result.parsed = { error: 'empty_response' };
    return result;
  }

  const parsed = tryParseJson(raw);

  if (parsed == null) {
    // Not valid JSON at all.
    result.parsed = {
      error: 'invalid_json',
      rawSnippet: preview(raw, 200),
    };
  } else {
    // Locate the entry list: array root, or the `questions` array of an object.
    let entries = null;
    if (Array.isArray(parsed)) {
      entries = parsed;
    } else if (typeof parsed === 'object' && Array.isArray(parsed.questions)) {
      entries = parsed.questions;
    }

    if (entries) {
      result.parsed = parsed;
      const valid = entries
        .map(questionTextOf)
        .filter((question) => question.endsWith('?'));
      result.questions = limitQuestions(valid, maxQuestions);
      return result;
    }

    // Parsed JSON, but not in a recognized shape.
    result.parsed = {
      error: 'unrecognized_json_shape',
      rawSnippet: preview(raw, 200),
    };
  }

  // Fallback: extract questions from plain text.
  result.questions = limitQuestions(extractQuestionsFromText(raw), maxQuestions);
  return result;
}
/**
 * High-level helper used by the pipeline: build the question-generation
 * prompt around `contextText`, send it to `provider`, and parse the reply.
 *
 *   const { questions, raw, parsed } =
 *     await runQuestionGenerator(contextText, provider, { maxQuestions });
 *
 * @param {string} contextText - Text the questions should be about.
 * @param {{ generate: Function }} provider - Called as `provider.generate(prompt)`;
 *   its result is awaited.
 * @param {{ maxQuestions?: number }} [options] - Forwarded to
 *   parseQuestionResponse; defaults to 5.
 * @returns {Promise<object>} The value returned by parseQuestionResponse, or
 *   `{ questions: [], raw: '', parsed: { error: 'empty_context' } }` when the
 *   context is empty or whitespace-only (the provider is not called then).
 * @throws {Error} When `provider` has no `generate` function. This is checked
 *   before the context.
 */
export async function runQuestionGenerator(
  contextText,
  provider,
  { maxQuestions = 5 } = {},
) {
  const providerIsUsable = provider && typeof provider.generate === 'function';
  if (!providerIsUsable) {
    throw new Error('Question provider must implement .generate(prompt)');
  }

  const hasContext = contextText && contextText.trim();
  if (!hasContext) {
    return { questions: [], raw: '', parsed: { error: 'empty_context' } };
  }

  // Minimal built-in instructions. A richer prompt file could be loaded and
  // have its {{CONTEXT}} placeholder filled in before calling the provider.
  const instructionLines = [
    'You are a question generation assistant.',
    '',
    'You will be given a chunk of spiritual teaching text as CONTEXT.',
    'Generate diverse, high-quality questions that:',
    '- are answerable from the context only,',
    '- require some thinking, not just copying a sentence,',
    '- are phrased as clear, direct questions.',
    '',
    'Return either:',
    '- JSON: { "questions": ["Q1?", "Q2?", ...] }',
    ' or an array of question-like objects/strings; OR',
    '- Plain text with one question per line.',
    '',
  ];
  const contextSection = ['---', 'CONTEXT:', contextText, '---'];
  const prompt = [...instructionLines, ...contextSection].join('\n');

  const modelOutput = await provider.generate(prompt);
  return parseQuestionResponse(modelOutput, { maxQuestions });
}