htaf commited on
Commit
68e4117
·
1 Parent(s): 2baa954

tightened up question limits

Browse files
AGENTS.md ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Repository Guidelines
2
+
3
+ ## Project Structure & Module Organization
4
+ - Core pipeline lives in `src/`, with stage logic in `src/generator`, `src/verifier`, `src/reward`, `src/question`, retrieval helpers in `src/retrieval`, and the CLI entrypoint at `src/pipeline/pipeline_cli.js`.
5
+ - Prompts are in `prompts/`; tweak these before changing stage behaviour.
6
+ - Tests sit in `tests/` (Vitest), with sample seeds in `test_samples/`; pipeline outputs write to `gold/`.
7
+ - Config baselines (models, limits) are in `configs/pipeline.json`; run scripts live at the repo root (`run.sh`, `try_prompt.sh`).
8
+
9
+ ## Build, Test, and Development Commands
10
+ - `npm install` – install dependencies.
11
+ - `npm run pipeline -- --limit 20 --verbose` – run the default pipeline using static seeds.
12
+ - `PIPELINE_SEED_MODE=question-first npm run pipeline -- --limit 20 --verbose` – enable question-first seeding.
13
+ - `npm test` – run all unit tests (mocked by default).
14
+ - `REAL_ES=1 npm test` – exercise retrieval against a live Elasticsearch + embedding endpoint.
15
+
16
+ ## Coding Style & Naming Conventions
17
+ - ECMAScript modules (`type: "module"`); prefer `.mjs` for shared code.
18
+ - Two-space indentation, single quotes unless template strings add clarity, and keep functions small and pure where possible (CLI glue stays in `pipeline_cli.js`).
19
+ - Use descriptive lower_snake_case or camelCase names for variables; exported helpers use camelCase.
20
+ - Keep prompts and stage logic separate; place reusable utilities in `src/pipeline/util.mjs`.
21
+
22
+ ## Testing Guidelines
23
+ - Vitest is the test runner; add new tests under `tests/` with `.test.mjs` suffix.
24
+ - Mirror stage names in test files (e.g., `generator_core.test.mjs`), and include both happy-path and malformed-input cases.
25
+ - For retrieval, default mocks cover most cases; only opt into `REAL_ES=1` when you have a running distill-rag stack.
26
+ - Aim to keep tests deterministic—mock providers and network calls unless explicitly validating integrations.
27
+
28
+ ## Commit & Pull Request Guidelines
29
+ - Follow the existing history: short, present-tense summaries (e.g., `add generator test`, `maintain chunk ordering`); include scoped prefixes only when they improve clarity.
30
+ - Keep commits focused (one concern each) and ensure `npm test` passes before pushing.
31
+ - PRs should state the goal, main changes, test evidence, and any required `.env` or config updates; include sample command/output paths when relevant (e.g., `gold/pipeline_gold.jsonl`).
32
+ - Link issues when applicable and note any provider/model assumptions or external services needed for validation.
33
+
34
+ ## Configuration & Environment Tips
35
+ - Runtime configuration comes from `.env` (ES node, embedding endpoint, provider selections, stage models); avoid committing secrets.
36
+ - When changing model or provider choices, update `configs/pipeline.json` if you want a sharable default, and document overrides in your PR description.
src/core/llm_stage.mjs ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // src/core/llm_stage.mjs
2
+ import fs from 'fs/promises';
3
+ import path from 'path';
4
+
5
+ import { PROJECT_ROOT } from '../pipeline/util.mjs';
6
+ import { loadProviderFor } from '../providers/provider.mjs';
7
+
8
/**
 * Read a prompt template file from disk.
 * Relative paths are resolved against the project root
 * (e.g. "prompts/question_prompt.txt"); absolute paths are used as-is.
 */
async function loadTemplate(templatePath) {
  const resolved = path.isAbsolute(templatePath)
    ? templatePath
    : path.join(PROJECT_ROOT, templatePath);
  return fs.readFile(resolved, 'utf8');
}
20
+
21
/**
 * Interpolate {{VAR}} placeholders into a template string.
 * Missing, null, or undefined variables render as the empty string;
 * everything else is stringified. Placeholder names may use letters,
 * digits, and underscores (case-insensitive match).
 */
function renderTemplate(template, vars = {}) {
  const substitute = (_match, name) => {
    const value = vars[name];
    if (value == null) return '';
    return String(value);
  };
  return template.replace(/{{\s*([A-Z0-9_]+)\s*}}/gi, substitute);
}
31
+
32
/**
 * Generic LLM stage runner.
 *
 * Loads a prompt template, interpolates variables into it, sends the
 * rendered prompt to the stage's provider, and returns both the prompt
 * and the raw model output.
 *
 * @param {object} opts
 * @param {string} opts.stage - logical stage name: "question" | "generator" | "verifier" | "reward" | "expert_x"
 * @param {string} opts.template - path to prompt template (e.g. "prompts/question_prompt.txt")
 * @param {object} [opts.vars] - interpolation vars for the template (QUESTION, CONTEXT, ANSWER, etc.)
 * @param {object} [opts.provider] - optional provider instance; if omitted, uses loadProviderFor(stage)
 * @param {boolean} [opts.verbose] - when true, log the stage name and a prompt preview
 * @param {Console} [opts.logger] - log sink; falls back to console
 *
 * @returns {Promise<{ prompt: string, raw: string }>}
 */
export async function runLLMStage({
  stage,
  template,
  vars = {},
  provider,
  verbose = false,
  logger = console,
} = {}) {
  if (!stage) throw new Error('runLLMStage: "stage" is required');
  if (!template) throw new Error('runLLMStage: "template" is required');

  const log = logger?.log?.bind(logger) || console.log;

  // An explicitly-passed provider wins; otherwise resolve one by stage name.
  const activeProvider = provider || loadProviderFor(stage);

  const templateText = await loadTemplate(template);
  const prompt = renderTemplate(templateText, vars);

  if (verbose) {
    log(`[llm_stage] stage=${stage}`);
    log('[llm_stage] prompt preview:\n' + prompt.slice(0, 400));
  }

  const raw = await activeProvider.generate(prompt);

  return { prompt, raw };
}
src/pipeline/batch.mjs CHANGED
@@ -8,12 +8,12 @@ import {
8
  DEFAULT_OUT_PATH,
9
  loadSeedQuestions,
10
  seedToQuestion,
11
- // seedToContextText, // <-- no longer needed for QG mode
12
  } from './seeds.mjs';
13
  import { runPipelineStep } from './step.mjs';
14
  import { loadProviderFor } from '../providers/provider.mjs';
15
  import { runQuestionGenerator } from '../question/question_core.mjs';
16
  import { fetchChunksFromIndex } from '../retrieval/retrieval.mjs';
 
17
 
18
  /**
19
  * Append a single accepted record to a JSONL file.
@@ -30,7 +30,7 @@ export async function appendGoldRecord(outPath, record) {
30
  *
31
  * Modes:
32
  * - question-first (default): seeds are CHUNKS; we generate questions from each chunk
33
- * - static: seeds are questions (legacy / low-priority mode)
34
  *
35
  * Options:
36
  * - seedsPath: JSONL of seeds (defaults to test_samples/seed_questions.jsonl)
@@ -56,9 +56,10 @@ export async function runPipelineBatch({
56
  seedsPath = DEFAULT_SEEDS_PATH,
57
  outPath = DEFAULT_OUT_PATH,
58
  limit,
 
59
  verbose = false,
60
  logger = console,
61
- seedMode = process.env.PIPELINE_SEED_MODE || 'static',
62
  } = {}) {
63
  const log = logger?.log?.bind(logger) || console.log;
64
  const errLog = logger?.error?.bind(logger) || console.error;
@@ -137,32 +138,76 @@ export async function runPipelineBatch({
137
  }
138
 
139
  // ----------------------------------------
140
- // MODE 2: question-first (ES chunks → QG → pipeline)
141
  // ----------------------------------------
142
  if (seedMode === 'question-first') {
143
  const questionProvider = loadProviderFor('question');
144
 
145
- // support both env names
146
  const maxQuestionsPerChunk = Number(
147
  process.env.QUESTION_MAX_PER_CHUNK ||
148
  process.env.QUESTION_MAX ||
149
  '5',
150
  );
151
 
152
- const chunkLimit =
153
- typeof limit === 'number' ? limit : undefined;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
 
155
  if (verbose) {
156
- log(
157
- `[pipeline] fetching chunks from ES (limit=${chunkLimit ?? 'default'})`,
158
- );
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  }
160
 
161
- const chunks = await fetchChunksFromIndex(chunkLimit);
162
  const totalChunks = chunks.length;
163
  let processedSeeds = 0;
164
 
165
  for (let idx = 0; idx < chunks.length; idx++) {
 
 
166
  const chunk = chunks[idx];
167
  const label = `[chunk ${idx + 1}/${chunks.length}]`;
168
  const contextText = chunk.content;
@@ -177,8 +222,10 @@ export async function runPipelineBatch({
177
  processedSeeds += 1;
178
 
179
  if (verbose) {
180
- log(`\n🧩 ${label} generating questions from ES chunk…`);
181
- log(` [question] es _id: ${chunk.id}`);
 
 
182
  log(
183
  ' [question] chunk preview:\n ' +
184
  preview(contextText, 300).replace(/\n/g, '\n '),
@@ -223,6 +270,7 @@ export async function runPipelineBatch({
223
 
224
  // 2) run full pipeline for each generated question
225
  for (const q of questions) {
 
226
  if (!q || !q.trim()) continue;
227
 
228
  const qLabel = `[q ${processed + 1}]`;
@@ -233,8 +281,7 @@ export async function runPipelineBatch({
233
  try {
234
  const result = await runPipelineStep({
235
  question: q,
236
- // 🔑 KEY FIX: reuse this ES chunk as the *only* context
237
- initialContext: [chunk],
238
  verbose,
239
  logger,
240
  });
@@ -251,8 +298,8 @@ export async function runPipelineBatch({
251
  const record = {
252
  question: q,
253
  sourceChunkId: chunk.id,
254
- sourceChunk: contextText, // raw ES chunk text
255
- sourceDoc: chunk.source, // full ES _source
256
  context: result.context,
257
  sample: result.gen,
258
  verifier: result.ver,
@@ -279,8 +326,8 @@ export async function runPipelineBatch({
279
  return {
280
  mode: seedMode,
281
  total: totalChunks,
282
- processed, // number of questions processed
283
- processedSeeds, // how many chunks we actually used
284
  processedQuestions: processed,
285
  accepted,
286
  outPath,
 
8
  DEFAULT_OUT_PATH,
9
  loadSeedQuestions,
10
  seedToQuestion,
 
11
  } from './seeds.mjs';
12
  import { runPipelineStep } from './step.mjs';
13
  import { loadProviderFor } from '../providers/provider.mjs';
14
  import { runQuestionGenerator } from '../question/question_core.mjs';
15
  import { fetchChunksFromIndex } from '../retrieval/retrieval.mjs';
16
+ import { loadRagChunks } from '../retrieval/jsonl_chunks.mjs';
17
 
18
  /**
19
  * Append a single accepted record to a JSONL file.
 
30
  *
31
  * Modes:
32
  * - question-first (default): seeds are CHUNKS; we generate questions from each chunk
33
+ * - static: seeds are questions (legacy / low-priority mode)
34
  *
35
  * Options:
36
  * - seedsPath: JSONL of seeds (defaults to test_samples/seed_questions.jsonl)
 
56
  seedsPath = DEFAULT_SEEDS_PATH,
57
  outPath = DEFAULT_OUT_PATH,
58
  limit,
59
+ chunkLimit,
60
  verbose = false,
61
  logger = console,
62
+ seedMode = process.env.PIPELINE_SEED_MODE || 'question-first',
63
  } = {}) {
64
  const log = logger?.log?.bind(logger) || console.log;
65
  const errLog = logger?.error?.bind(logger) || console.error;
 
138
  }
139
 
140
  // ----------------------------------------
141
+ // MODE 2: question-first (JSONL or ES chunks → QG → pipeline)
142
  // ----------------------------------------
143
  if (seedMode === 'question-first') {
144
  const questionProvider = loadProviderFor('question');
145
 
 
146
  const maxQuestionsPerChunk = Number(
147
  process.env.QUESTION_MAX_PER_CHUNK ||
148
  process.env.QUESTION_MAX ||
149
  '5',
150
  );
151
 
152
+ const questionCap =
153
+ typeof limit === 'number' ? limit : Number.POSITIVE_INFINITY;
154
+
155
+ const effectiveChunkLimit = (() => {
156
+ if (typeof chunkLimit === 'number') return chunkLimit;
157
+ const envLimit =
158
+ process.env.PIPELINE_CHUNK_LIMIT ||
159
+ process.env.PIPELINE_CHUNKS ||
160
+ process.env.PIPELINE_CHUNK_SAMPLE;
161
+ const parsed = envLimit != null ? Number(envLimit) : NaN;
162
+ return Number.isFinite(parsed) ? parsed : undefined;
163
+ })();
164
+
165
+ // Decide where chunks come from:
166
+ // PIPELINE_CHUNK_SOURCE = 'jsonl' | 'es'
167
+ const chunkSource =
168
+ process.env.PIPELINE_CHUNK_SOURCE || 'es';
169
 
170
  if (verbose) {
171
+ if (chunkSource === 'jsonl') {
172
+ const p =
173
+ process.env.RAG_CHUNKS_PATH ||
174
+ 'data/rag_chunks.jsonl';
175
+ log(
176
+ `[pipeline] loading chunks from JSONL (${p}), limit=${effectiveChunkLimit ?? 'all'}`,
177
+ );
178
+ } else {
179
+ log(
180
+ `[pipeline] fetching chunks from ES (limit=${effectiveChunkLimit ?? 'all'})`,
181
+ );
182
+ }
183
+ }
184
+
185
+ let chunks = [];
186
+ try {
187
+ if (chunkSource === 'jsonl') {
188
+ chunks = await loadRagChunks(effectiveChunkLimit);
189
+ } else {
190
+ chunks = await fetchChunksFromIndex(effectiveChunkLimit);
191
+ }
192
+ } catch (e) {
193
+ const msg = e?.message || String(e);
194
+ errLog('[pipeline] ERROR loading chunks:', msg);
195
+ return {
196
+ mode: seedMode,
197
+ total: 0,
198
+ processed: 0,
199
+ accepted: 0,
200
+ outPath,
201
+ statusCounts: { chunk_load_error: 1 },
202
+ };
203
  }
204
 
 
205
  const totalChunks = chunks.length;
206
  let processedSeeds = 0;
207
 
208
  for (let idx = 0; idx < chunks.length; idx++) {
209
+ if (processed >= questionCap) break;
210
+
211
  const chunk = chunks[idx];
212
  const label = `[chunk ${idx + 1}/${chunks.length}]`;
213
  const contextText = chunk.content;
 
222
  processedSeeds += 1;
223
 
224
  if (verbose) {
225
+ log(`\n🧩 ${label} generating questions from chunk…`);
226
+ if (chunk.id) {
227
+ log(` [question] chunk id: ${chunk.id}`);
228
+ }
229
  log(
230
  ' [question] chunk preview:\n ' +
231
  preview(contextText, 300).replace(/\n/g, '\n '),
 
270
 
271
  // 2) run full pipeline for each generated question
272
  for (const q of questions) {
273
+ if (processed >= questionCap) break;
274
  if (!q || !q.trim()) continue;
275
 
276
  const qLabel = `[q ${processed + 1}]`;
 
281
  try {
282
  const result = await runPipelineStep({
283
  question: q,
284
+ initialContext: [chunk], // IMPORTANT: reuse SAME chunk, no second retrieval
 
285
  verbose,
286
  logger,
287
  });
 
298
  const record = {
299
  question: q,
300
  sourceChunkId: chunk.id,
301
+ sourceChunk: contextText,
302
+ sourceDoc: chunk.source,
303
  context: result.context,
304
  sample: result.gen,
305
  verifier: result.ver,
 
326
  return {
327
  mode: seedMode,
328
  total: totalChunks,
329
+ processed, // number of questions processed
330
+ processedSeeds,
331
  processedQuestions: processed,
332
  accepted,
333
  outPath,
src/pipeline/pipeline_cli.js CHANGED
@@ -26,6 +26,7 @@ function parseArgs(argv) {
26
  let outPath;
27
  let verbose = false;
28
  let seedMode; // optional CLI override
 
29
 
30
  for (let i = 0; i < args.length; i++) {
31
  const a = args[i];
@@ -39,6 +40,10 @@ function parseArgs(argv) {
39
  } else if (a === '--out') {
40
  outPath = args[i + 1];
41
  i++;
 
 
 
 
42
  } else if (a === '--mode') {
43
  seedMode = args[i + 1]; // "question-first" | "static"
44
  i++;
@@ -61,6 +66,7 @@ function parseArgs(argv) {
61
  outPath: outPath || DEFAULT_OUT,
62
  verbose,
63
  seedMode,
 
64
  };
65
  }
66
 
@@ -71,6 +77,7 @@ async function main() {
71
  outPath,
72
  verbose,
73
  seedMode: cliSeedMode,
 
74
  } = parseArgs(process.argv);
75
 
76
  const generatorProvider = process.env.GENERATOR_PROVIDER || 'ollama';
@@ -105,7 +112,15 @@ async function main() {
105
  console.log(
106
  ` reward: ${rewardProvider} (${rewardModel})`,
107
  );
108
- console.log(` Limit: ${limit ?? 'all'}`);
 
 
 
 
 
 
 
 
109
  console.log(` Verbose: ${verbose ? 'yes' : 'no'}`);
110
  console.log('');
111
 
@@ -114,6 +129,7 @@ async function main() {
114
  seedsPath,
115
  outPath,
116
  limit,
 
117
  verbose,
118
  logger: console,
119
  seedMode: mode,
 
26
  let outPath;
27
  let verbose = false;
28
  let seedMode; // optional CLI override
29
+ let chunkLimit;
30
 
31
  for (let i = 0; i < args.length; i++) {
32
  const a = args[i];
 
40
  } else if (a === '--out') {
41
  outPath = args[i + 1];
42
  i++;
43
+ } else if (a === '--chunk-limit') {
44
+ const v = Number(args[i + 1]);
45
+ if (!Number.isNaN(v)) chunkLimit = v;
46
+ i++;
47
  } else if (a === '--mode') {
48
  seedMode = args[i + 1]; // "question-first" | "static"
49
  i++;
 
66
  outPath: outPath || DEFAULT_OUT,
67
  verbose,
68
  seedMode,
69
+ chunkLimit,
70
  };
71
  }
72
 
 
77
  outPath,
78
  verbose,
79
  seedMode: cliSeedMode,
80
+ chunkLimit,
81
  } = parseArgs(process.argv);
82
 
83
  const generatorProvider = process.env.GENERATOR_PROVIDER || 'ollama';
 
112
  console.log(
113
  ` reward: ${rewardProvider} (${rewardModel})`,
114
  );
115
+ console.log(` Question limit: ${limit ?? 'all'}`);
116
+ if (mode === 'question-first') {
117
+ const chunkLimitEffective =
118
+ chunkLimit ??
119
+ (process.env.PIPELINE_CHUNK_LIMIT ||
120
+ process.env.PIPELINE_CHUNKS ||
121
+ process.env.PIPELINE_CHUNK_SAMPLE);
122
+ console.log(` Chunk limit: ${chunkLimitEffective ?? 'all'}`);
123
+ }
124
  console.log(` Verbose: ${verbose ? 'yes' : 'no'}`);
125
  console.log('');
126
 
 
129
  seedsPath,
130
  outPath,
131
  limit,
132
+ chunkLimit,
133
  verbose,
134
  logger: console,
135
  seedMode: mode,
src/pipeline/pipeline_spec.mjs ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// src/pipeline/pipeline_spec.mjs

/**
 * High-level pipeline description.
 *
 * This doesn’t execute anything; it just documents how
 * stages should be wired together.
 *
 * You can later have a generic “orchestrator” that reads this
 * and calls runLLMStage + parse functions dynamically.
 *
 * Each stage entry carries:
 *   id        - unique stage identifier within this pipeline
 *   kind      - descriptive role of the stage
 *   stageName - provider lookup key (see loadProviderFor)
 *   template  - prompt template path relative to the project root
 *   inputs    - upstream values interpolated into the template
 *   outputKey - key the stage's parsed result is stored under
 *   parser    - name of the function that parses the raw LLM output
 */
export const DISTILLATION_PIPELINE = {
  id: 'confederation_distillation_v1',
  stages: [
    // Stage 1: generate candidate questions from a retrieved chunk.
    {
      id: 'question',
      kind: 'question_generator',
      stageName: 'question', // maps to provider type
      template: 'prompts/question_prompt.txt',
      inputs: ['chunk'], // variables: { CONTEXT: chunk.content }
      outputKey: 'questions',
      parser: 'parseQuestions', // from question_core.mjs
    },
    // Stage 2: answer each question using the same chunk as context.
    {
      id: 'generator',
      kind: 'answer_generator',
      stageName: 'generator',
      template: 'prompts/generator_prompt.txt',
      inputs: ['question', 'chunk'],
      outputKey: 'gen',
      parser: 'parseGeneratorOutput',
    },
    // Stage 3: verify the generated answer against the same context.
    {
      id: 'verifier',
      kind: 'verifier',
      stageName: 'verifier',
      template: 'prompts/verifier_prompt.txt',
      inputs: ['question', 'chunk', 'gen'],
      outputKey: 'ver',
      parser: 'parseVerifierOutput',
    },
    // Stage 4: score the (question, answer, verification) triple.
    {
      id: 'reward',
      kind: 'reward_model',
      stageName: 'reward',
      template: 'prompts/reward_prompt.txt',
      inputs: ['question', 'chunk', 'gen', 'ver'],
      outputKey: 'rew',
      parser: 'parseRewardOutput',
    },
  ],
};
src/pipeline/step.mjs CHANGED
@@ -1,6 +1,11 @@
1
  // src/pipeline/step.mjs
2
  import { loadProviderFor } from '../providers/provider.mjs';
3
- import { hybridSearch } from '../retrieval/retrieval.mjs';
 
 
 
 
 
4
  import { runGenerator } from '../generator/generator_core.mjs';
5
  import { runVerifier } from '../verifier/verifier_core.mjs';
6
  import { runReward } from '../reward/reward_core.mjs';
@@ -82,7 +87,21 @@ export async function runPipelineStep({
82
  // Go to ES exactly once
83
  try {
84
  if (verbose) log(` [retrieval] mode=${retrievalMode} k=${k}`);
85
- const hits = await hybridSearch(question, k);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  if (verbose) {
87
  log(` [retrieval] got ${hits.length} chunks from ES`);
88
  }
 
1
  // src/pipeline/step.mjs
2
  import { loadProviderFor } from '../providers/provider.mjs';
3
+ import {
4
+ hybridSearch,
5
+ bm25Search,
6
+ vectorSearch,
7
+ hydeHybrid,
8
+ } from '../retrieval/retrieval.mjs';
9
  import { runGenerator } from '../generator/generator_core.mjs';
10
  import { runVerifier } from '../verifier/verifier_core.mjs';
11
  import { runReward } from '../reward/reward_core.mjs';
 
87
  // Go to ES exactly once
88
  try {
89
  if (verbose) log(` [retrieval] mode=${retrievalMode} k=${k}`);
90
+
91
+ const hits = await (async () => {
92
+ switch (retrievalMode) {
93
+ case 'bm25':
94
+ return bm25Search(question, k);
95
+ case 'vector':
96
+ return vectorSearch(question, k);
97
+ case 'hyde':
98
+ return hydeHybrid(question, k, genProv);
99
+ case 'hybrid':
100
+ default:
101
+ return hybridSearch(question, k);
102
+ }
103
+ })();
104
+
105
  if (verbose) {
106
  log(` [retrieval] got ${hits.length} chunks from ES`);
107
  }
src/question/question_core.mjs CHANGED
@@ -1,126 +1,230 @@
1
  // src/question/question_core.mjs
2
- import fs from 'fs/promises';
3
- import path from 'path';
4
- import { fileURLToPath } from 'url';
5
-
6
- const __filename = fileURLToPath(import.meta.url);
7
- const __dirname = path.dirname(__filename);
8
-
9
- const TEMPLATE_PATH = path.resolve(
10
- __dirname,
11
- '..',
12
- '..',
13
- 'prompts',
14
- 'question_prompt.txt',
15
- );
16
-
17
- let cachedTemplate = null;
18
-
19
- async function loadQuestionTemplate() {
20
- if (cachedTemplate) return cachedTemplate;
21
- cachedTemplate = await fs.readFile(TEMPLATE_PATH, 'utf8');
22
- return cachedTemplate;
23
  }
24
 
25
  /**
26
- * Extract questions using JSON-first, then plain-text fallback.
 
 
 
 
 
 
 
 
27
  *
28
- * @param {string} raw
29
- * @param {number} maxQuestions
30
- * @returns {{ questions: string[], parsed: any }}
31
  */
32
- function parseQuestions(raw, maxQuestions) {
33
- let parsed = null;
34
- let questions = [];
35
 
36
- if (!raw || typeof raw !== 'string') {
37
- return { questions, parsed };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  }
39
 
40
- // ----- 1) Try JSON -----
41
- try {
42
- const json = JSON.parse(raw);
43
- parsed = json;
44
-
45
- // Case A: { questions: [...] }
46
- if (json && Array.isArray(json.questions)) {
47
- questions = json.questions
48
- .map((q) => String(q).trim())
49
- .filter((q) => q.length > 0);
50
- }
51
- // Case B: root is an array: [ "Q1?", "Q2?" ]
52
- else if (Array.isArray(json)) {
53
- questions = json
54
- .map((q) => String(q).trim())
55
- .filter((q) => q.length > 0);
56
  }
57
- } catch (e) {
58
- parsed = { error: 'invalid_json', message: e?.message };
59
  }
60
 
61
- // ----- 2) Plain-text fallback if we still have no questions -----
62
- if (!questions.length) {
63
- const lines = raw
64
- .split('\n')
65
- .map((l) => l.trim())
66
- // strip bullets / numbering: "1. ", "- ", "* ", "• "
67
- .map((l) => l.replace(/^[-•*()\d.\s]+/, ''))
68
- // keep lines that look like questions
69
- .filter((l) => l.length > 0 && /[??!]$/.test(l));
70
-
71
- questions = lines;
72
  }
73
 
74
- if (questions.length > maxQuestions) {
75
- questions = questions.slice(0, maxQuestions);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  }
77
 
78
- return { questions, parsed };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  }
80
 
81
  /**
82
- * Build prompt and generate questions from a context chunk.
83
- *
84
- * @param {string} contextText - chunk from ES
85
- * @param {object} provider - { generate(prompt) → string }
86
- * @param {object} opts
87
- * - maxQuestions?: number (defaults QUESTION_MAX or 5)
88
  *
89
- * @returns {Promise<{
90
- * raw: string,
91
- * prompt: string,
92
- * questions: string[],
93
- * maxQuestions: number,
94
- * parsed: any
95
- * }>}
96
  */
97
  export async function runQuestionGenerator(
98
  contextText,
99
  provider,
100
- opts = {},
101
  ) {
102
- const maxQuestions =
103
- opts.maxQuestions ?? Number(process.env.QUESTION_MAX || '5');
 
104
 
105
- const template = await loadQuestionTemplate();
 
 
106
 
107
- const prompt = template
108
- .replace(/{{CONTEXT}}/g, contextText)
109
- .replace(/{{MAX_QUESTIONS}}/g, String(maxQuestions));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
  const raw = await provider.generate(prompt);
 
112
 
113
- const { questions, parsed } = parseQuestions(raw, maxQuestions);
114
-
115
- return {
116
- raw,
117
- prompt,
118
- questions,
119
- maxQuestions,
120
- parsed,
121
- };
122
  }
123
-
124
- export default {
125
- runQuestionGenerator,
126
- };
 
1
  // src/question/question_core.mjs
2
+ import { preview } from '../pipeline/util.mjs';
3
+
4
+ /**
5
+ * Safely parse JSON. Returns:
6
+ * - a parsed value on success
7
+ * - null on failure (and optionally an error object if needed)
8
+ */
9
+ function tryParseJson(raw) {
10
+ if (!raw || typeof raw !== 'string') return null;
11
+ const trimmed = raw.trim();
12
+
13
+ // Quick sanity: must start with { or [
14
+ if (!trimmed.startsWith('{') && !trimmed.startsWith('[')) {
15
+ return null;
16
+ }
17
+
18
+ try {
19
+ return JSON.parse(trimmed);
20
+ } catch {
21
+ return null;
22
+ }
23
  }
24
 
25
  /**
26
+ * Extract questions from a plain-text response.
27
+ *
28
+ * This is designed to handle real LLM outputs like:
29
+ *
30
+ * What is the primary purpose of practicing presence according to the text?
31
+ * How does Q'uo characterize the physical vehicle's limitations?
32
+ * What is the role of pain and struggle in spiritual growth?
33
+ *
34
+ * as well as numbered/bulleted lists:
35
  *
36
+ * 1. What is ... ?
37
+ * - How does ... ?
38
+ * * Why is ... ?
39
  */
40
+ function extractQuestionsFromText(rawText) {
41
+ if (!rawText || typeof rawText !== 'string') return [];
 
42
 
43
+ // Strip trivial XML/HTML-ish tags like <analysis>, <reasoning>, etc.
44
+ const stripped = rawText.replace(/<\/?[a-zA-Z0-9_:-]+>/g, ' ');
45
+
46
+ const lines = stripped
47
+ .split(/\r?\n/)
48
+ .map((l) => l.trim())
49
+ .filter(Boolean);
50
+
51
+ const questions = [];
52
+
53
+ for (const line of lines) {
54
+ // Must contain a question mark somewhere
55
+ if (!line.includes('?')) continue;
56
+
57
+ // Common prefixes: "1. ", "1) ", "- ", "* "
58
+ const cleaned = line.replace(/^(?:\d+\s*[.)]\s*|[-*]\s*)/, '').trim();
59
+
60
+ // Take up to the first '?' as the end of the question
61
+ const qPart = cleaned.split('?')[0].trim();
62
+ if (!qPart) continue;
63
+
64
+ const q = (qPart + '?').trim();
65
+
66
+ // Filter out tiny or degenerate things
67
+ if (q.length < 10) continue;
68
+ if (!/[a-zA-Z]/.test(q)) continue;
69
+
70
+ questions.push(q);
71
  }
72
 
73
+ // If we didn't find anything line-based, optional fallback:
74
+ // try to split the whole text by '?' and recover sentence-like chunks.
75
+ if (questions.length === 0) {
76
+ const segments = stripped.split('?');
77
+ for (let i = 0; i < segments.length - 1; i++) {
78
+ const seg = segments[i].trim();
79
+ if (!seg) continue;
80
+ // Consider only reasonable-length segments
81
+ if (seg.length < 10) continue;
82
+ const candidate = seg + '?';
83
+ if (!/[a-zA-Z]/.test(candidate)) continue;
84
+ questions.push(candidate);
 
 
 
 
85
  }
 
 
86
  }
87
 
88
+ // Deduplicate while preserving order
89
+ const seen = new Set();
90
+ const deduped = [];
91
+ for (const q of questions) {
92
+ if (seen.has(q)) continue;
93
+ seen.add(q);
94
+ deduped.push(q);
 
 
 
 
95
  }
96
 
97
+ return deduped;
98
+ }
99
+
100
/**
 * Core helper: take raw model string and return:
 *   {
 *     questions: string[],
 *     raw: string,
 *     parsed: any | { error: 'invalid_json' | 'unrecognized_json_shape' | 'empty_response', rawSnippet?: string }
 *   }
 *
 * - Tries JSON first: { questions: [...] } or an array root (strings or
 *   { question } / { question_text } objects).
 * - If JSON fails or has an unrecognized shape, falls back to text-based
 *   extraction via extractQuestionsFromText.
 */
export function parseQuestionResponse(raw, { maxQuestions } = {}) {
  const result = {
    questions: [],
    raw: raw ?? '',
    parsed: null,
  };

  if (!raw || typeof raw !== 'string') {
    result.parsed = { error: 'empty_response' };
    return result;
  }

  // Apply the optional cap; a falsy maxQuestions means "no cap".
  const cap = (qs) => (maxQuestions ? qs.slice(0, maxQuestions) : qs);

  const parsed = tryParseJson(raw);

  if (parsed == null) {
    // Not valid JSON at all — recover questions from the plain text.
    result.parsed = {
      error: 'invalid_json',
      rawSnippet: preview(raw, 200),
    };
    result.questions = cap(extractQuestionsFromText(raw));
    return result;
  }

  result.parsed = parsed;

  // Case 1: { questions: [...] }
  if (
    parsed &&
    typeof parsed === 'object' &&
    Array.isArray(parsed.questions)
  ) {
    const qs = parsed.questions
      .map((q) => (typeof q === 'string' ? q.trim() : ''))
      .filter((q) => q && q.endsWith('?'));
    result.questions = cap(qs);
    return result;
  }

  // Case 2: array root — strings or { question } / { question_text } objects
  if (Array.isArray(parsed)) {
    const toQuestion = (item) => {
      if (typeof item === 'string') return item.trim();
      if (item && typeof item === 'object') {
        if (typeof item.question === 'string') {
          return item.question.trim();
        }
        if (typeof item.question_text === 'string') {
          return item.question_text.trim();
        }
      }
      return '';
    };
    const qs = parsed
      .map(toQuestion)
      .filter((q) => q && q.endsWith('?'));
    result.questions = cap(qs);
    return result;
  }

  // Parsed JSON but not in a recognized shape — fall back to text.
  result.parsed = {
    error: 'unrecognized_json_shape',
    rawSnippet: preview(raw, 200),
  };
  result.questions = cap(extractQuestionsFromText(raw));
  return result;
}
185
 
186
/**
 * High-level helper used by the pipeline:
 *
 *   const { questions, raw, parsed } =
 *     await runQuestionGenerator(contextText, provider, { maxQuestions });
 *
 * Builds a minimal built-in prompt around the context chunk, calls the
 * provider, and parses the response (JSON first, plain-text fallback).
 */
export async function runQuestionGenerator(
  contextText,
  provider,
  { maxQuestions = 5 } = {},
) {
  if (!provider || typeof provider.generate !== 'function') {
    throw new Error('Question provider must implement .generate(prompt)');
  }

  if (!contextText || !contextText.trim()) {
    return { questions: [], raw: '', parsed: { error: 'empty_context' } };
  }

  // Minimal built-in prompt; a richer prompt file could be loaded and
  // {{CONTEXT}}-injected before calling provider.generate instead.
  const promptLines = [
    'You are a question generation assistant.',
    '',
    'You will be given a chunk of spiritual teaching text as CONTEXT.',
    'Generate diverse, high-quality questions that:',
    '- are answerable from the context only,',
    '- require some thinking, not just copying a sentence,',
    '- are phrased as clear, direct questions.',
    '',
    'Return either:',
    '- JSON: { "questions": ["Q1?", "Q2?", ...] }',
    ' or an array of question-like objects/strings; OR',
    '- Plain text with one question per line.',
    '',
    '---',
    'CONTEXT:',
    contextText,
    '---',
  ];
  const prompt = promptLines.join('\n');

  const raw = await provider.generate(prompt);

  return parseQuestionResponse(raw, { maxQuestions });
}
 
 
 
 
src/retrieval/jsonl_chunks.mjs ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // src/retrieval/jsonl_chunks.mjs
2
+ import fs from 'fs/promises';
3
+ import path from 'path';
4
+ import crypto from 'crypto';
5
+ import { PROJECT_ROOT } from '../pipeline/util.mjs';
6
+
7
+ const DEFAULT_RAG_PATH = path.join(
8
+ PROJECT_ROOT,
9
+ 'data',
10
+ 'rag_chunks.jsonl',
11
+ );
12
+
13
// Per-path cache of parsed chunks. Keyed by the resolved absolute path so
// loading two different JSONL files never collides. (Previously a single
// shared variable ignored the filePath argument, so a second call with a
// different path silently returned chunks from the first file.)
const chunkCache = new Map();

/**
 * Parse rag_chunks.jsonl into an array of
 * { id, content, source } records.
 *
 * We are deliberately tolerant about field names so this works
 * with different builders:
 *   - content: obj.content || obj.text || obj.chunk || obj.body || ''
 *   - id:      obj.id || obj.session_key || obj.title || `jsonl-${idx}`
 *   - source:  the whole original object
 *
 * Lines that are not valid JSON are skipped instead of aborting the load.
 *
 * @param {string} [filePath] Absolute path, or path relative to PROJECT_ROOT.
 * @returns {Promise<Array<{id: string, content: string, source: object}>>}
 */
async function loadAllChunksFromJsonl(filePath = DEFAULT_RAG_PATH) {
  const absPath = path.isAbsolute(filePath)
    ? filePath
    : path.join(PROJECT_ROOT, filePath);

  // Fix: cache per resolved path, not one global cache for all paths.
  if (chunkCache.has(absPath)) return chunkCache.get(absPath);

  const raw = await fs.readFile(absPath, 'utf8');
  const lines = raw
    .split('\n')
    .map((l) => l.trim())
    .filter(Boolean);

  const chunks = lines.map((line, idx) => {
    let obj;
    try {
      obj = JSON.parse(line);
    } catch {
      // Skip bad lines instead of exploding
      return null;
    }

    const content =
      obj.content ||
      obj.text ||
      obj.chunk ||
      obj.body ||
      '';

    const id =
      obj.id ||
      obj.session_key ||
      obj.title ||
      `jsonl-${idx}`;

    return {
      id,
      content,
      source: obj,
    };
  });

  const parsed = chunks.filter(Boolean);
  chunkCache.set(absPath, parsed);
  return parsed;
}
71
+
72
/**
 * Sample up to `k` distinct elements from `arr`, without replacement,
 * using crypto.randomInt (a CSPRNG) as the randomness source.
 * `k == null` or `k >= arr.length` ⇒ returns a shallow copy of the
 * whole array in its original order.
 */
function sampleWithoutReplacement(arr, k) {
  const total = arr.length;
  if (k == null || k >= total) return arr.slice();

  const pickedIdx = new Set();
  const picked = [];

  // Rejection sampling: draw indices until k distinct ones are found
  // (or every index has been seen, which bounds the loop).
  while (picked.length < k && pickedIdx.size < total) {
    const candidate = crypto.randomInt(0, total);
    if (!pickedIdx.has(candidate)) {
      pickedIdx.add(candidate);
      picked.push(arr[candidate]);
    }
  }

  return picked;
}
92
+
93
/**
 * Public API: load RAG chunks for pipeline seeding.
 *
 * Resolution order for the source file: explicit `filePath` argument,
 * then the RAG_CHUNKS_PATH env var, then data/rag_chunks.jsonl.
 *
 * @param {number|undefined} limit Max chunks to return
 * @param {string|undefined} filePath Override path (defaults to env or data/rag_chunks.jsonl)
 * @returns {Promise<Array<{id, content, source}>>}
 */
export async function loadRagChunks(limit, filePath) {
  const resolvedPath =
    filePath || process.env.RAG_CHUNKS_PATH || DEFAULT_RAG_PATH;

  const allChunks = await loadAllChunksFromJsonl(resolvedPath);
  if (!allChunks || allChunks.length === 0) return [];

  const sampleSize = limit ?? allChunks.length;
  return sampleWithoutReplacement(allChunks, sampleSize);
}
src/retrieval/retrieval.mjs CHANGED
@@ -143,8 +143,14 @@ Do NOT include JSON or formatting.
143
  // ----------------------------------------
144
  // Chunk sampling from ES (for QG pipeline)
145
  // ----------------------------------------
146
- export async function fetchChunksFromIndex(limit = 10) {
147
- const size = Number.isFinite(limit) ? limit : 10;
 
 
 
 
 
 
148
 
149
  const res = await client.search({
150
  index: ES_INDEX,
@@ -176,4 +182,3 @@ export default {
176
  hydeHybrid,
177
  fetchChunksFromIndex,
178
  };
179
-
 
143
  // ----------------------------------------
144
  // Chunk sampling from ES (for QG pipeline)
145
  // ----------------------------------------
146
+ const DEFAULT_CHUNK_LIMIT = Number(
147
+ process.env.PIPELINE_CHUNK_LIMIT ||
148
+ process.env.RETRIEVAL_CHUNK_LIMIT ||
149
+ '100',
150
+ );
151
+
152
+ export async function fetchChunksFromIndex(limit) {
153
+ const size = Number.isFinite(limit) ? limit : DEFAULT_CHUNK_LIMIT;
154
 
155
  const res = await client.search({
156
  index: ES_INDEX,
 
182
  hydeHybrid,
183
  fetchChunksFromIndex,
184
  };
 
tests/llm_stage.test.mjs ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // tests/llm_stage.test.mjs
2
+ import { describe, it, expect, vi } from 'vitest';
3
+ import { runLLMStage } from '../src/core/llm_stage.mjs';
4
+
5
describe('runLLMStage (core LLM wrapper)', () => {
  it('fills the template, calls provider.generate, and returns raw + prompt', async () => {
    // Fake provider echoes a prefix of the prompt so we can later verify
    // that `raw` is exactly what the provider produced.
    const fakeProvider = {
      generate: vi.fn(async (prompt) => `ECHO::${prompt.slice(0, 40)}`),
    };

    const question = 'What is love?';
    const context = 'Love is the field in which all beings move.';

    const { raw, prompt } = await runLLMStage({
      stage: 'generator',
      // Use an existing prompt template so we’re exercising real I/O
      template: 'prompts/generator_prompt.txt',
      vars: {
        QUESTION: question,
        CONTEXT: context,
      },
      provider: fakeProvider, // avoid hitting Ollama in tests
      verbose: false,
      logger: { log: () => {} },
    });

    // Provider was called exactly once
    expect(fakeProvider.generate).toHaveBeenCalledTimes(1);

    const calledPrompt = fakeProvider.generate.mock.calls[0][0];

    // The rendered prompt should contain our substituted vars
    expect(calledPrompt).toContain(question);
    expect(calledPrompt).toContain(context);

    // Returned prompt should match what we sent to the provider
    expect(prompt).toBe(calledPrompt);

    // raw should be whatever the provider returned
    expect(raw).toBe(`ECHO::${calledPrompt.slice(0, 40)}`);
  });

  it('throws if provider has no generate() method', async () => {
    // Matches loosely on "generate" so the exact error wording can evolve.
    await expect(
      runLLMStage({
        stage: 'generator',
        template: 'prompts/generator_prompt.txt',
        vars: {
          QUESTION: 'Test?',
          CONTEXT: 'Some context',
        },
        provider: {}, // missing generate()
        verbose: false,
        logger: { log: () => {} },
      }),
    ).rejects.toThrow(/generate/i);
  });
});
tests/ollama_provider_reasoning.test.mjs ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // tests/ollama_provider_reasoning.test.mjs
2
+ import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
3
+
4
// Saved globals so each test can mutate fetch/env freely and restore after.
let originalFetch;
let originalEnv;

beforeEach(() => {
  originalFetch = globalThis.fetch;
  // shallow clone is fine for test mutations
  originalEnv = { ...process.env };
});

afterEach(() => {
  globalThis.fetch = originalFetch;
  process.env = originalEnv;
});

describe('OllamaProvider reasoning flag', () => {
  it('adds options.reasoning=true when OLLAMA_REASONING is enabled', async () => {
    // Arrange env *before* importing the module (ENABLE_REASONING is computed at import time)
    process.env.OLLAMA_REASONING = 'true';
    process.env.OLLAMA_URL = 'http://ollama.local';
    process.env.GENERATOR_MODEL = 'test-model';

    // Stub fetch so no real HTTP call is made; record what the provider sends.
    const fetchMock = vi.fn(async () => ({
      ok: true,
      json: async () => ({ response: 'ok' }),
    }));
    globalThis.fetch = fetchMock;

    // Dynamic import so env is read fresh for this test
    const { OllamaProvider } = await import(
      '../src/providers/ollama_provider.mjs'
    );

    const provider = new OllamaProvider({ model: 'test-model' });

    // Act
    await provider.generate('Hello, world');

    // Assert
    expect(fetchMock).toHaveBeenCalledTimes(1);
    const [url, options] = fetchMock.mock.calls[0];

    expect(url).toMatch(/\/api\/generate$/);

    const body = JSON.parse(options.body);

    // The reasoning flag must land inside Ollama's `options` payload.
    expect(body).toHaveProperty('model', 'test-model');
    expect(body).toHaveProperty('prompt');
    expect(body).toHaveProperty('options');
    expect(body.options).toHaveProperty('reasoning', true);
  });
});
tests/pipeline_behaviour.test.mjs ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
2
+ import os from 'os';
3
+ import path from 'path';
4
+
5
describe('runPipelineStep retrieval modes', () => {
  beforeEach(() => {
    vi.resetModules();
  });

  afterEach(() => {
    vi.restoreAllMocks();
    vi.unmock('../src/retrieval/retrieval.mjs');
    vi.unmock('../src/providers/provider.mjs');
    vi.unmock('../src/generator/generator_core.mjs');
    vi.unmock('../src/verifier/verifier_core.mjs');
    vi.unmock('../src/reward/reward_core.mjs');
  });

  // Mocks every retrieval strategy plus the downstream stages so the step
  // can run with no network/model access; returns the spies and a freshly
  // imported runPipelineStep bound to those mocks.
  async function setupRetrievalMocks() {
    const hybridSearch = vi.fn(async () => [{ content: 'hybrid' }]);
    const bm25Search = vi.fn(async () => [{ content: 'bm25' }]);
    const vectorSearch = vi.fn(async () => [{ content: 'vector' }]);
    const hydeHybrid = vi.fn(async () => [{ content: 'hyde' }]);

    vi.doMock('../src/retrieval/retrieval.mjs', () => ({
      hybridSearch,
      bm25Search,
      vectorSearch,
      hydeHybrid,
    }));

    vi.doMock('../src/providers/provider.mjs', () => ({
      loadProviderFor: () => ({ generate: async () => 'ok' }),
    }));

    vi.doMock('../src/generator/generator_core.mjs', () => ({
      runGenerator: vi.fn(async () => ({ answer: 'a', raw: 'a' })),
    }));
    vi.doMock('../src/verifier/verifier_core.mjs', () => ({
      runVerifier: vi.fn(async () => ({ ok: true, raw: 'yes' })),
    }));
    vi.doMock('../src/reward/reward_core.mjs', () => ({
      runReward: vi.fn(async () => ({ ok: true, score: 1, raw: '1' })),
    }));

    const mod = await import('../src/pipeline/step.mjs');
    return { hybridSearch, bm25Search, vectorSearch, hydeHybrid, runPipelineStep: mod.runPipelineStep };
  }

  it('uses bm25 when retrievalMode=bm25', async () => {
    const mocks = await setupRetrievalMocks();

    await mocks.runPipelineStep({ question: 'q', retrievalMode: 'bm25' });

    // Only the selected strategy fires; all others stay untouched.
    expect(mocks.bm25Search).toHaveBeenCalledTimes(1);
    expect(mocks.vectorSearch).not.toHaveBeenCalled();
    expect(mocks.hybridSearch).not.toHaveBeenCalled();
    expect(mocks.hydeHybrid).not.toHaveBeenCalled();
  });

  it('uses hyde when retrievalMode=hyde', async () => {
    const mocks = await setupRetrievalMocks();

    await mocks.runPipelineStep({ question: 'q', retrievalMode: 'hyde' });

    expect(mocks.hydeHybrid).toHaveBeenCalledTimes(1);
    expect(mocks.bm25Search).not.toHaveBeenCalled();
    expect(mocks.vectorSearch).not.toHaveBeenCalled();
    expect(mocks.hybridSearch).not.toHaveBeenCalled();
  });
});
72
+
73
describe('runPipelineBatch question cap', () => {
  beforeEach(() => {
    vi.resetModules();
    // Force the JSONL chunk source so the mocked loadRagChunks is used.
    process.env.PIPELINE_CHUNK_SOURCE = 'jsonl';
  });

  afterEach(() => {
    delete process.env.PIPELINE_CHUNK_SOURCE;
    vi.restoreAllMocks();
    vi.unmock('../src/providers/provider.mjs');
    vi.unmock('../src/retrieval/jsonl_chunks.mjs');
    vi.unmock('../src/pipeline/step.mjs');
  });

  it('stops once the question limit is reached in question-first mode', async () => {
    const questionsPerChunk = 'Q1?\nQ2?\nQ3?';

    // Mock question provider + retrieval
    vi.doMock('../src/providers/provider.mjs', () => ({
      loadProviderFor: (stage) =>
        stage === 'question'
          ? { generate: async () => questionsPerChunk }
          : { generate: async () => '' },
    }));

    // Force question generator to return a fixed list to avoid parser variability
    const runQuestionGenerator = vi.fn(async () => ({
      questions: ['Q1?', 'Q2?', 'Q3?'],
      raw: questionsPerChunk,
      parsed: null,
    }));
    vi.doMock('../src/question/question_core.mjs', () => ({ runQuestionGenerator }));

    vi.doMock('../src/retrieval/jsonl_chunks.mjs', () => ({
      loadRagChunks: vi.fn(async (limit) =>
        Array.from({ length: limit ?? 3 }, (_, i) => ({
          id: `c-${i}`,
          content: `chunk ${i}`,
        })),
      ),
    }));

    // Stub runPipelineStep to avoid model calls; count invocations
    const runPipelineStep = vi.fn(async () => ({
      status: 'accepted',
      context: [],
      gen: { answer: 'a' },
      ver: { ok: true },
      rew: { ok: true },
    }));
    vi.doMock('../src/pipeline/step.mjs', () => ({ runPipelineStep }));

    // Write gold output to a throwaway temp file, not the repo's gold/.
    const outPath = path.join(os.tmpdir(), `test-gold-${Date.now()}.jsonl`);
    const { runPipelineBatch } = await import('../src/pipeline/batch.mjs');
    const result = await runPipelineBatch({
      limit: 2, // question cap
      chunkLimit: 3,
      seedMode: 'question-first',
      outPath,
      verbose: false,
      logger: { log() {}, error() {} },
    });

    // 3 chunks × 3 questions are available, but the cap of 2 must win.
    expect(result.processed).toBe(2);
    expect(runPipelineStep).toHaveBeenCalledTimes(2);
    expect(result.accepted).toBe(2);
  });
});
141
+
142
describe('fetchChunksFromIndex default size', () => {
  beforeEach(() => {
    vi.resetModules();
    vi.unmock('../src/retrieval/retrieval.mjs');
  });

  afterEach(() => {
    vi.restoreAllMocks();
  });

  it('uses a larger default than 10 when no limit is provided', async () => {
    let capturedOpts = null;

    vi.unmock('../src/retrieval/retrieval.mjs');

    // Capture the search options handed to the Elasticsearch client.
    vi.doMock('@elastic/elasticsearch', () => ({
      Client: vi.fn().mockImplementation(() => ({
        search: vi.fn(async (opts) => {
          capturedOpts = opts;
          return { hits: { hits: [] } };
        }),
      })),
    }));

    // avoid accidental network fetch calls in this test
    vi.doMock('node-fetch', () => ({
      default: vi.fn(async () => ({ ok: true, json: async () => ({ embedding: [] }) })),
    }));

    // Re-export the real module so the fetch function exists, but with mocked deps above.
    vi.doMock('../src/retrieval/retrieval.mjs', async (importOriginal) => {
      const actual = await importOriginal();
      return { ...actual };
    });

    const { fetchChunksFromIndex } = await import('../src/retrieval/retrieval.mjs');
    await fetchChunksFromIndex();

    // Pins the new env-driven default (falls back to 100 when neither
    // PIPELINE_CHUNK_LIMIT nor RETRIEVAL_CHUNK_LIMIT is set).
    expect(capturedOpts?.size).toBe(100);
  });
});