// File size: 6,294 Bytes

// src/question/question_core.mjs
import { preview } from '../pipeline/util.mjs';

/**
 * Leniently parse a model response as JSON.
 *
 * Only object or array payloads are accepted: after trimming, the text must
 * begin with `{` or `[`. Because models frequently wrap JSON in a Markdown
 * code fence, a response that consists entirely of ONE fenced block
 * (``` or ```json ... ```) is unwrapped before parsing. Fences that are
 * surrounded by other text are deliberately left alone so that mixed
 * prose/code responses keep going through the plain-text path.
 *
 * @param {unknown} raw - Candidate JSON text.
 * @returns {object | Array | null} The parsed value, or null when `raw` is
 *   not a non-empty string, does not look like an object/array, or is not
 *   valid JSON. This function never throws and never returns an error object.
 */
function tryParseJson(raw) {
  if (!raw || typeof raw !== 'string') return null;

  let text = raw.trim();

  // Unwrap "```lang\n ... \n```" only when the fence spans the whole payload.
  // The lazy group plus the `$` anchor means the closing fence must be the
  // very last thing in the text.
  const fenced = /^```[\w-]*[ \t]*\r?\n([\s\S]*?)\r?\n?[ \t]*```$/.exec(text);
  if (fenced) {
    text = fenced[1].trim();
  }

  // Quick sanity check: only objects and arrays are of interest to callers.
  if (!text.startsWith('{') && !text.startsWith('[')) {
    return null;
  }

  try {
    return JSON.parse(text);
  } catch {
    // Malformed JSON is an expected outcome here, not an exceptional one.
    return null;
  }
}

/**
 * Extract questions from a plain-text response.
 *
 * This is designed to handle real LLM outputs like:
 *
 *   What is the primary purpose of practicing presence according to the text?
 *   How does Q'uo characterize the physical vehicle's limitations?
 *   What is the role of pain and struggle in spiritual growth?
 *
 * as well as numbered/bulleted lists, optionally with Markdown bold markers:
 *
 *   1. What is ... ?
 *   - How does ... ?
 *   * Why is ... ?
 *   • When does ... ?
 *   2. **Which ... ?**
 *
 * Line pass: every line containing `?` contributes the text up to its FIRST
 * `?` (anything after it on the same line is dropped). Candidates shorter
 * than 10 characters or without any letter are discarded.
 *
 * Fallback pass (only when the line pass found nothing): the whole text is
 * split on `?`, so a question wrapped across several lines can still be
 * recovered; its internal whitespace is collapsed to single spaces.
 *
 * @param {unknown} rawText - Raw model output.
 * @returns {string[]} Unique questions in first-seen order; empty when
 *   `rawText` is not a non-empty string or nothing question-like is found.
 */
function extractQuestionsFromText(rawText) {
  if (!rawText || typeof rawText !== 'string') return [];

  // Strip trivial XML/HTML-ish tags like <analysis>, <reasoning>, etc.
  // They become a space so the words on either side do not fuse together.
  const stripped = rawText.replace(/<\/?[a-zA-Z0-9_:-]+>/g, ' ');

  // One or more leading list/emphasis markers: "1. ", "1) ", "- ", "* ", "• ".
  // Repetition covers combinations such as "1. **" or "- **".
  const leadingMarkers = /^(?:\d+\s*[.)]\s*|[-*•]\s*)+/;

  // Any Unicode letter, so questions in non-Latin scripts are not rejected.
  const hasLetter = /\p{L}/u;

  // A Set keeps insertion order, giving order-preserving de-duplication.
  const found = new Set();

  for (const rawLine of stripped.split(/\r?\n/)) {
    const line = rawLine.trim();

    // Must contain a question mark somewhere.
    if (!line.includes('?')) continue;

    // Take up to the first '?' as the end of the question.
    const stem = line.replace(leadingMarkers, '').split('?')[0].trim();
    if (!stem) continue;

    const question = `${stem}?`;

    // Filter out tiny or degenerate things.
    if (question.length < 10 || !hasLetter.test(question)) continue;

    found.add(question);
  }

  if (found.size === 0) {
    // The final segment is whatever follows the last '?', so it can never be
    // a question and is skipped.
    for (const segment of stripped.split('?').slice(0, -1)) {
      const stem = segment
        .replace(/\s+/g, ' ')
        .trim()
        .replace(leadingMarkers, '')
        .trim();

      // Consider only reasonable-length segments.
      if (stem.length < 10) continue;

      const candidate = `${stem}?`;
      if (hasLetter.test(candidate)) found.add(candidate);
    }
  }

  return [...found];
}

/**
 * Core helper: turn a raw model string into a structured result:
 *   {
 *     questions: string[],
 *     raw: string,
 *     parsed: any | { error: string, rawSnippet?: string }
 *   }
 *
 * Strategy:
 * - Tries JSON first (via tryParseJson): `{ questions: [...] }` or an array
 *   root. In both shapes each item may be a string or an object carrying a
 *   string `question` / `question_text`; items are trimmed and kept only when
 *   they end with `?`. A recognized shape is final, even if it yields zero
 *   questions, and `parsed` holds the decoded JSON.
 * - Otherwise falls back to text-based extraction (extractQuestionsFromText)
 *   and `parsed` becomes an error marker with a 200-character `rawSnippet`.
 *
 * `parsed.error` is one of:
 *   - 'empty_response'          raw was empty or not a string
 *   - 'invalid_json'            raw could not be parsed as a JSON object/array
 *   - 'unrecognized_json_shape' JSON parsed, but is neither supported shape
 *
 * @param {unknown} raw - Raw model output.
 * @param {{ maxQuestions?: number }} [options] - `maxQuestions` caps the
 *   number of returned questions when it is greater than zero; any other
 *   value (undefined, 0, negative, NaN) means "no cap".
 * @returns {{ questions: string[], raw: any, parsed: any }} `raw` echoes the
 *   input, with null/undefined replaced by ''.
 */
export function parseQuestionResponse(raw, { maxQuestions } = {}) {
  const result = {
    questions: [],
    raw: raw ?? '',
    parsed: null,
  };

  if (!raw || typeof raw !== 'string') {
    result.parsed = { error: 'empty_response' };
    return result;
  }

  // `> 0` is false for undefined, null, NaN, 0 and negatives. A plain
  // truthiness test would let a negative cap reach slice(0, -n), which
  // silently drops questions from the END of the list.
  const applyCap = (list) =>
    maxQuestions > 0 ? list.slice(0, maxQuestions) : list;

  // Pull the question text out of one JSON item; '' marks "not usable".
  const toQuestionText = (item) => {
    if (typeof item === 'string') return item.trim();
    if (item && typeof item === 'object') {
      if (typeof item.question === 'string') return item.question.trim();
      if (typeof item.question_text === 'string') {
        return item.question_text.trim();
      }
    }
    return '';
  };

  // ''.endsWith('?') is false, so unusable items are dropped here as well.
  const normalizeItems = (items) =>
    items.map(toQuestionText).filter((q) => q.endsWith('?'));

  const parsed = tryParseJson(raw);

  if (parsed != null) {
    // Both supported shapes boil down to "an array of question-like items".
    let items = null;
    if (Array.isArray(parsed)) {
      items = parsed;
    } else if (typeof parsed === 'object' && Array.isArray(parsed.questions)) {
      items = parsed.questions;
    }

    if (items) {
      result.parsed = parsed;
      result.questions = applyCap(normalizeItems(items));
      return result;
    }

    // Parsed JSON but not in a recognized shape.
    result.parsed = {
      error: 'unrecognized_json_shape',
      rawSnippet: preview(raw, 200),
    };
  } else {
    // Not valid JSON at all.
    result.parsed = {
      error: 'invalid_json',
      rawSnippet: preview(raw, 200),
    };
  }

  // Fallback: extract questions from plain text.
  result.questions = applyCap(extractQuestionsFromText(raw));

  return result;
}

/**
 * Pipeline entry point: build the question-generation prompt around
 * `contextText`, send it to the provider, and parse whatever comes back.
 *
 *   const { questions, raw, parsed } =
 *     await runQuestionGenerator(contextText, provider, { maxQuestions })
 *
 * @param {string} contextText - Source text the questions must be about.
 * @param {{ generate: (prompt: string) => any }} provider - Its `generate`
 *   result is awaited and handed to parseQuestionResponse.
 * @param {{ maxQuestions?: number }} [options] - Forwarded to
 *   parseQuestionResponse; defaults to 5.
 * @returns {Promise<object>} The parseQuestionResponse result, or
 *   `{ questions: [], raw: '', parsed: { error: 'empty_context' } }` when the
 *   context is empty or whitespace-only (the provider is not called then).
 * @throws {Error} When `provider` is missing or has no `generate` function.
 */
export async function runQuestionGenerator(
  contextText,
  provider,
  { maxQuestions = 5 } = {},
) {
  // Validate the provider before anything else so misconfiguration fails loudly.
  if (typeof provider?.generate !== 'function') {
    throw new Error('Question provider must implement .generate(prompt)');
  }

  const isBlankContext = !contextText || !contextText.trim();
  if (isBlankContext) {
    return { questions: [], raw: '', parsed: { error: 'empty_context' } };
  }

  // Minimal built-in instructions. A richer prompt file could be loaded and
  // have {{CONTEXT}} injected before calling provider.generate instead.
  const instructions = [
    'You are a question generation assistant.',
    '',
    'You will be given a chunk of spiritual teaching text as CONTEXT.',
    'Generate diverse, high-quality questions that:',
    '- are answerable from the context only,',
    '- require some thinking, not just copying a sentence,',
    '- are phrased as clear, direct questions.',
    '',
    'Return either:',
    '- JSON: { "questions": ["Q1?", "Q2?", ...] }',
    '  or an array of question-like objects/strings; OR',
    '- Plain text with one question per line.',
    '',
    '---',
    'CONTEXT:',
  ].join('\n');

  // The context sits between the "CONTEXT:" header and a closing rule.
  const prompt = `${instructions}\n${contextText}\n---`;

  const modelOutput = await provider.generate(prompt);
  return parseQuestionResponse(modelOutput, { maxQuestions });
}