htaf committed on
Commit
0918f71
·
1 Parent(s): 936c57a

added intermediate storage of chunks

Browse files
AGENTS.md CHANGED
@@ -5,6 +5,7 @@
5
  - Prompts are in `prompts/`; tweak these before changing stage behaviour.
6
  - Tests sit in `tests/` (Vitest), with sample seeds in `test_samples/`; pipeline outputs write to `gold/`.
7
  - Config baselines (models, limits) are in `configs/pipeline.json`; run scripts live at the repo root (`run.sh`, `try_prompt.sh`).
 
8
 
9
  ## Build, Test, and Development Commands
10
  - `npm install` – install dependencies.
@@ -12,12 +13,14 @@
12
  - `PIPELINE_SEED_MODE=question-first npm run pipeline -- --limit 20 --verbose` – enable question-first seeding.
13
  - `npm test` – run all unit tests (mocked by default).
14
  - `REAL_ES=1 npm test` – exercise retrieval against a live Elasticsearch + embedding endpoint.
 
15
 
16
  ## Coding Style & Naming Conventions
17
  - ECMAScript modules (`type: "module"`); prefer `.mjs` for shared code.
18
  - Two-space indentation, single quotes unless template strings add clarity, and keep functions small and pure where possible (CLI glue stays in `pipeline_cli.js`).
19
  - Use descriptive, lower_snake or camelCase for variables; exported helpers use camelCase.
20
  - Keep prompts and stage logic separate; place reusable utilities in `src/pipeline/util.mjs`.
 
21
 
22
  ## Testing Guidelines
23
  - Vitest is the test runner; add new tests under `tests/` with `.test.mjs` suffix.
 
5
  - Prompts are in `prompts/`; tweak these before changing stage behaviour.
6
  - Tests sit in `tests/` (Vitest), with sample seeds in `test_samples/`; pipeline outputs write to `gold/`.
7
  - Config baselines (models, limits) are in `configs/pipeline.json`; run scripts live at the repo root (`run.sh`, `try_prompt.sh`).
8
+ - Cached intermediates (questions/gens/verifications/rewards) live in `data/cache/*.jsonl`; set `PIPELINE_CACHE_DIR` to redirect.
9
 
10
  ## Build, Test, and Development Commands
11
  - `npm install` – install dependencies.
 
13
  - `PIPELINE_SEED_MODE=question-first npm run pipeline -- --limit 20 --verbose` – enable question-first seeding.
14
  - `npm test` – run all unit tests (mocked by default).
15
  - `REAL_ES=1 npm test` – exercise retrieval against a live Elasticsearch + embedding endpoint.
16
+ - Red/green pathway: use `*_PROVIDER=mock` plus JSONL chunk source to dry-run (green) without models; switch to real providers for red runs and the cache will skip already-completed stages.
17
 
18
  ## Coding Style & Naming Conventions
19
  - ECMAScript modules (`type: "module"`); prefer `.mjs` for shared code.
20
  - Two-space indentation, single quotes unless template strings add clarity, and keep functions small and pure where possible (CLI glue stays in `pipeline_cli.js`).
21
  - Use descriptive, lower_snake or camelCase for variables; exported helpers use camelCase.
22
  - Keep prompts and stage logic separate; place reusable utilities in `src/pipeline/util.mjs`.
23
+ - Deterministic IDs: chunks are hashed from content+source; questions/gens/rewards are keyed in JSONL caches so reruns can skip already-processed work.
24
 
25
  ## Testing Guidelines
26
  - Vitest is the test runner; add new tests under `tests/` with `.test.mjs` suffix.
data/cache/generations.jsonl ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {"chunk_id":"chunk-1","question_id":"a85338a842d7801079bbbc9d5cab2665c5cf65fda9acdc73b2151994e73555bd","gen_id":"fe108065f6f7a4f099d4687547a35803133e8a72d110560b0d6f6dc3e92f945d","answer":"Here is a grounded answer.","thought":"mock reasoning","raw":"<think>mock reasoning</think>Here is a grounded answer.","provider":"mock","ts":1763950116519}
2
+ {"chunk_id":"chunk-1","question_id":"e8b843bd798e1617421c699c4b9cab3689e52d013c314481a5255917fcd19958","gen_id":"23079eb060c27259b66fec6e0cd66140b8c65c731b6f2724000e49c675e382c2","answer":"Here is a grounded answer.","thought":"mock reasoning","raw":"<think>mock reasoning</think>Here is a grounded answer.","provider":"mock","ts":1763950116521}
3
+ {"chunk_id":"c-0","question_id":"d7bd2353ef09a9061f7d6757ab0d9453e6ff9b735212ea69984cf541a3e506c7","gen_id":"fec93133bc62fb9bd895d88deec5afef39f9d5858997b0ccf1f8826868c758e7","answer":"a","provider":"ollama","ts":1763950116529}
4
+ {"chunk_id":"chunk-2","question_id":"5bb2bda8a2e19f3b77bcdd33f54e068df86b1dce9d82d1046438447d5908cbde","gen_id":"3cfe24cc427220e94ff010c167776baf27a9650bc47294a21c7715e89b01cda8","answer":"Here is a grounded answer.","thought":"mock reasoning","raw":"<think>mock reasoning</think>Here is a grounded answer.","provider":"mock","ts":1763950116529}
5
+ {"chunk_id":"c-0","question_id":"61ae2d8d90d176a1a499aa6b965c3729f6a596a345b0926441f86845c36a56d6","gen_id":"c3f3f96ec39323a48b373ca251675ee6904c41333bc6d4612c8cdd1a4b9163f3","answer":"a","provider":"ollama","ts":1763950116534}
6
+ {"chunk_id":"c1","question_id":"d159888d476c85a6459fe892d359efeebb8821ee5ec32e4cfa0f3ea19c25e7e9","gen_id":"7ba5bdcd55087da7ece639dbf4875a61ef67f8740cd851b071aae05daa3fb871","answer":"a","provider":"ollama","model":"qwen3-vl:8b-thinking","ts":1763950148198}
data/cache/questions.jsonl ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {"chunk_id":"chunk-1","questions":["What is the main idea?","How does the text justify its claim?"],"question_ids":["a85338a842d7801079bbbc9d5cab2665c5cf65fda9acdc73b2151994e73555bd","e8b843bd798e1617421c699c4b9cab3689e52d013c314481a5255917fcd19958"],"provider":"mock","ts":1763950116516}
2
+ {"chunk_id":"chunk-2","questions":["What is the main idea?","How does the text justify its claim?"],"question_ids":["5bb2bda8a2e19f3b77bcdd33f54e068df86b1dce9d82d1046438447d5908cbde","53f4bb043a43683a68a25a3bb34f3170c362ea5583114ed1222d46011b8d9d5c"],"provider":"mock","ts":1763950116524}
3
+ {"chunk_id":"c-0","questions":["Q1?","Q2?","Q3?"],"question_ids":["d7bd2353ef09a9061f7d6757ab0d9453e6ff9b735212ea69984cf541a3e506c7","61ae2d8d90d176a1a499aa6b965c3729f6a596a345b0926441f86845c36a56d6","fee11e7734240ba47650e2e7b8d71ecb626d2777361138c2fdc033b61fd392fa"],"ts":1763950116525}
4
+ {"chunk_id":"c1","questions":["Q1?","Q2?","Q3?"],"question_ids":["d159888d476c85a6459fe892d359efeebb8821ee5ec32e4cfa0f3ea19c25e7e9","96e999c82e3f02db378e52ca4b50a04a979e9cf303057917dfe21370cc15db01","46a8298c8dc8564e0240a018963d5a3ce2bda1ca75fe367ee45e7c822296f3c9"],"provider":"ollama","model":"qwen3-vl:8b-thinking","ts":1763950148197}
data/cache/rewards.jsonl ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {"chunk_id":"chunk-1","question_id":"a85338a842d7801079bbbc9d5cab2665c5cf65fda9acdc73b2151994e73555bd","gen_id":"fe108065f6f7a4f099d4687547a35803133e8a72d110560b0d6f6dc3e92f945d","score":0.9,"ok":true,"raw":"0.9 good","provider":"mock","ts":1763950116520}
2
+ {"chunk_id":"chunk-1","question_id":"e8b843bd798e1617421c699c4b9cab3689e52d013c314481a5255917fcd19958","gen_id":"23079eb060c27259b66fec6e0cd66140b8c65c731b6f2724000e49c675e382c2","score":0.9,"ok":true,"raw":"0.9 good","provider":"mock","ts":1763950116521}
3
+ {"chunk_id":"chunk-2","question_id":"5bb2bda8a2e19f3b77bcdd33f54e068df86b1dce9d82d1046438447d5908cbde","gen_id":"3cfe24cc427220e94ff010c167776baf27a9650bc47294a21c7715e89b01cda8","score":0.9,"ok":true,"raw":"0.9 good","provider":"mock","ts":1763950116530}
4
+ {"chunk_id":"c-0","question_id":"d7bd2353ef09a9061f7d6757ab0d9453e6ff9b735212ea69984cf541a3e506c7","gen_id":"fec93133bc62fb9bd895d88deec5afef39f9d5858997b0ccf1f8826868c758e7","ok":true,"provider":"ollama","ts":1763950116530}
5
+ {"chunk_id":"c-0","question_id":"61ae2d8d90d176a1a499aa6b965c3729f6a596a345b0926441f86845c36a56d6","gen_id":"c3f3f96ec39323a48b373ca251675ee6904c41333bc6d4612c8cdd1a4b9163f3","ok":true,"provider":"ollama","ts":1763950116535}
6
+ {"chunk_id":"c1","question_id":"d159888d476c85a6459fe892d359efeebb8821ee5ec32e4cfa0f3ea19c25e7e9","gen_id":"7ba5bdcd55087da7ece639dbf4875a61ef67f8740cd851b071aae05daa3fb871","ok":true,"provider":"ollama","model":"tensortemplar/patronus-lynx:8b-instruct-q4_K_M","ts":1763950148199}
data/cache/verifications.jsonl ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {"chunk_id":"chunk-1","question_id":"a85338a842d7801079bbbc9d5cab2665c5cf65fda9acdc73b2151994e73555bd","gen_id":"fe108065f6f7a4f099d4687547a35803133e8a72d110560b0d6f6dc3e92f945d","ok":true,"raw":"yes\nmock verifier rationale","provider":"mock","ts":1763950116519}
2
+ {"chunk_id":"chunk-1","question_id":"e8b843bd798e1617421c699c4b9cab3689e52d013c314481a5255917fcd19958","gen_id":"23079eb060c27259b66fec6e0cd66140b8c65c731b6f2724000e49c675e382c2","ok":true,"raw":"yes\nmock verifier rationale","provider":"mock","ts":1763950116521}
3
+ {"chunk_id":"c-0","question_id":"d7bd2353ef09a9061f7d6757ab0d9453e6ff9b735212ea69984cf541a3e506c7","gen_id":"fec93133bc62fb9bd895d88deec5afef39f9d5858997b0ccf1f8826868c758e7","ok":true,"provider":"ollama","ts":1763950116529}
4
+ {"chunk_id":"chunk-2","question_id":"5bb2bda8a2e19f3b77bcdd33f54e068df86b1dce9d82d1046438447d5908cbde","gen_id":"3cfe24cc427220e94ff010c167776baf27a9650bc47294a21c7715e89b01cda8","ok":true,"raw":"yes\nmock verifier rationale","provider":"mock","ts":1763950116530}
5
+ {"chunk_id":"c-0","question_id":"61ae2d8d90d176a1a499aa6b965c3729f6a596a345b0926441f86845c36a56d6","gen_id":"c3f3f96ec39323a48b373ca251675ee6904c41333bc6d4612c8cdd1a4b9163f3","ok":true,"provider":"ollama","ts":1763950116535}
6
+ {"chunk_id":"c1","question_id":"d159888d476c85a6459fe892d359efeebb8821ee5ec32e4cfa0f3ea19c25e7e9","gen_id":"7ba5bdcd55087da7ece639dbf4875a61ef67f8740cd851b071aae05daa3fb871","ok":true,"provider":"ollama","model":"tensortemplar/patronus-lynx:8b-instruct-q4_K_M","ts":1763950148198}
src/pipeline/batch.mjs CHANGED
@@ -14,6 +14,19 @@ import { loadProviderFor } from '../providers/provider.mjs';
14
  import { runQuestionGenerator } from '../question/question_core.mjs';
15
  import { fetchChunksFromIndex } from '../retrieval/retrieval.mjs';
16
  import { loadRagChunks } from '../retrieval/jsonl_chunks.mjs';
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
  /**
19
  * Append a single accepted record to a JSONL file.
@@ -142,6 +155,12 @@ export async function runPipelineBatch({
142
  // ----------------------------------------
143
  if (seedMode === 'question-first') {
144
  const questionProvider = loadProviderFor('question');
 
 
 
 
 
 
145
 
146
  const maxQuestionsPerChunk = Number(
147
  process.env.QUESTION_MAX_PER_CHUNK ||
@@ -212,6 +231,14 @@ export async function runPipelineBatch({
212
  const label = `[chunk ${idx + 1}/${chunks.length}]`;
213
  const contextText = chunk.content;
214
 
 
 
 
 
 
 
 
 
215
  if (!contextText || !contextText.trim()) {
216
  if (verbose) {
217
  log(`${label} chunk content empty, skipping`);
@@ -223,9 +250,7 @@ export async function runPipelineBatch({
223
 
224
  if (verbose) {
225
  log(`\n🧩 ${label} generating questions from chunk…`);
226
- if (chunk.id) {
227
- log(` [question] chunk id: ${chunk.id}`);
228
- }
229
  log(
230
  ' [question] chunk preview:\n ' +
231
  preview(contextText, 300).replace(/\n/g, '\n '),
@@ -235,14 +260,34 @@ export async function runPipelineBatch({
235
  );
236
  }
237
 
238
- // 1) generate questions from this chunk
239
  let qResult;
 
240
  try {
241
- qResult = await runQuestionGenerator(
242
- contextText,
243
- questionProvider,
244
- { maxQuestions: maxQuestionsPerChunk },
245
- );
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
  } catch (e) {
247
  const msg = e?.message || String(e);
248
  statusCounts.question_error =
@@ -253,8 +298,6 @@ export async function runPipelineBatch({
253
  continue;
254
  }
255
 
256
- const questions = qResult?.questions || [];
257
-
258
  if (verbose) {
259
  log(
260
  ` [question] generated ${questions.length} question(s) from this chunk`,
@@ -273,6 +316,22 @@ export async function runPipelineBatch({
273
  if (processed >= questionCap) break;
274
  if (!q || !q.trim()) continue;
275
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
276
  const qLabel = `[q ${processed + 1}]`;
277
  log(
278
  ` → ${qLabel} Running pipeline for generated question: "${q}"`,
@@ -282,6 +341,7 @@ export async function runPipelineBatch({
282
  const result = await runPipelineStep({
283
  question: q,
284
  initialContext: [chunk], // IMPORTANT: reuse SAME chunk, no second retrieval
 
285
  verbose,
286
  logger,
287
  });
@@ -294,6 +354,37 @@ export async function runPipelineBatch({
294
  log(` ↳ status: ${result.status}`);
295
  }
296
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
297
  if (result.status === 'accepted') {
298
  const record = {
299
  question: q,
 
14
  import { runQuestionGenerator } from '../question/question_core.mjs';
15
  import { fetchChunksFromIndex } from '../retrieval/retrieval.mjs';
16
  import { loadRagChunks } from '../retrieval/jsonl_chunks.mjs';
17
+ import {
18
+ chunkIdFromContent,
19
+ questionId,
20
+ } from './cache.mjs';
21
+ import {
22
+ getCachedQuestions,
23
+ saveQuestions,
24
+ getCachedGeneration,
25
+ saveGeneration,
26
+ getCachedReward,
27
+ saveReward,
28
+ saveVerification,
29
+ } from './cache.mjs';
30
 
31
  /**
32
  * Append a single accepted record to a JSONL file.
 
155
  // ----------------------------------------
156
  if (seedMode === 'question-first') {
157
  const questionProvider = loadProviderFor('question');
158
+ const generatorProviderName =
159
+ process.env.GENERATOR_PROVIDER || 'ollama';
160
+ const verifierProviderName =
161
+ process.env.VERIFIER_PROVIDER || generatorProviderName;
162
+ const rewardProviderName =
163
+ process.env.REWARD_PROVIDER || generatorProviderName;
164
 
165
  const maxQuestionsPerChunk = Number(
166
  process.env.QUESTION_MAX_PER_CHUNK ||
 
231
  const label = `[chunk ${idx + 1}/${chunks.length}]`;
232
  const contextText = chunk.content;
233
 
234
+ const stableChunkId =
235
+ chunk.id ||
236
+ chunkIdFromContent(
237
+ contextText,
238
+ chunk.sourceId || chunk.source?.id,
239
+ );
240
+ chunk.id = stableChunkId;
241
+
242
  if (!contextText || !contextText.trim()) {
243
  if (verbose) {
244
  log(`${label} chunk content empty, skipping`);
 
250
 
251
  if (verbose) {
252
  log(`\n🧩 ${label} generating questions from chunk…`);
253
+ log(` [question] chunk id: ${chunk.id}`);
 
 
254
  log(
255
  ' [question] chunk preview:\n ' +
256
  preview(contextText, 300).replace(/\n/g, '\n '),
 
260
  );
261
  }
262
 
263
+ // 1) generate questions from this chunk (or load cache)
264
  let qResult;
265
+ let questions = [];
266
  try {
267
+ const cachedQRecords = await getCachedQuestions(stableChunkId);
268
+ if (cachedQRecords.length > 0) {
269
+ questions = cachedQRecords[0].questions.slice(
270
+ 0,
271
+ maxQuestionsPerChunk,
272
+ );
273
+ if (verbose)
274
+ log(
275
+ ` [question] loaded ${questions.length} cached question(s)`,
276
+ );
277
+ } else {
278
+ qResult = await runQuestionGenerator(
279
+ contextText,
280
+ questionProvider,
281
+ { maxQuestions: maxQuestionsPerChunk },
282
+ );
283
+ questions = qResult?.questions || [];
284
+ if (questions.length > 0) {
285
+ await saveQuestions(stableChunkId, questions, {
286
+ provider: process.env.QUESTION_PROVIDER,
287
+ model: process.env.QUESTION_MODEL,
288
+ });
289
+ }
290
+ }
291
  } catch (e) {
292
  const msg = e?.message || String(e);
293
  statusCounts.question_error =
 
298
  continue;
299
  }
300
 
 
 
301
  if (verbose) {
302
  log(
303
  ` [question] generated ${questions.length} question(s) from this chunk`,
 
316
  if (processed >= questionCap) break;
317
  if (!q || !q.trim()) continue;
318
 
319
+ const qId = questionId(stableChunkId, q);
320
+ const cachedReward = await getCachedReward(stableChunkId, qId);
321
+ if (cachedReward?.ok) {
322
+ processed += 1;
323
+ accepted += 1;
324
+ statusCounts.cached_reward =
325
+ (statusCounts.cached_reward || 0) + 1;
326
+ if (verbose)
327
+ log(
328
+ ` → [q ${processed}] using cached reward, skipping stages`,
329
+ );
330
+ continue;
331
+ }
332
+
333
+ const cachedGen = await getCachedGeneration(stableChunkId, qId);
334
+
335
  const qLabel = `[q ${processed + 1}]`;
336
  log(
337
  ` → ${qLabel} Running pipeline for generated question: "${q}"`,
 
341
  const result = await runPipelineStep({
342
  question: q,
343
  initialContext: [chunk], // IMPORTANT: reuse SAME chunk, no second retrieval
344
+ cachedGen,
345
  verbose,
346
  logger,
347
  });
 
354
  log(` ↳ status: ${result.status}`);
355
  }
356
 
357
+ let savedGenRecord = null;
358
+ if (result.gen) {
359
+ savedGenRecord = await saveGeneration(stableChunkId, qId, result.gen, {
360
+ provider: generatorProviderName,
361
+ model: process.env.GENERATOR_MODEL,
362
+ });
363
+ }
364
+ const genIdForFollowups =
365
+ savedGenRecord?.gen_id || cachedGen?.gen_id;
366
+ if (result.ver) {
367
+ await saveVerification(
368
+ stableChunkId,
369
+ qId,
370
+ genIdForFollowups,
371
+ result.ver,
372
+ {
373
+ provider: verifierProviderName,
374
+ model: process.env.VERIFIER_MODEL,
375
+ },
376
+ );
377
+ }
378
+ if (result.rew) {
379
+ await saveReward(
380
+ stableChunkId,
381
+ qId,
382
+ genIdForFollowups,
383
+ result.rew,
384
+ { provider: rewardProviderName, model: process.env.REWARD_MODEL },
385
+ );
386
+ }
387
+
388
  if (result.status === 'accepted') {
389
  const record = {
390
  question: q,
src/pipeline/cache.mjs ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // src/pipeline/cache.mjs
2
+ // Lightweight JSONL cache for intermediate pipeline artifacts.
3
+ import fs from 'fs/promises';
4
+ import path from 'path';
5
+ import { PROJECT_ROOT } from './util.mjs';
6
+ import {
7
+ chunkIdFromContent,
8
+ questionId,
9
+ generationId,
10
+ normalizeText,
11
+ } from './ids.mjs';
12
+
13
+ const CUSTOM_CACHE_DIR = process.env.PIPELINE_CACHE_DIR;
14
+ const CACHE_DIR = CUSTOM_CACHE_DIR
15
+ ? (path.isAbsolute(CUSTOM_CACHE_DIR)
16
+ ? CUSTOM_CACHE_DIR
17
+ : path.join(PROJECT_ROOT, CUSTOM_CACHE_DIR))
18
+ : path.join(PROJECT_ROOT, 'data', 'cache');
19
+
20
+ const FILES = {
21
+ questions: 'questions.jsonl',
22
+ generations: 'generations.jsonl',
23
+ verifications: 'verifications.jsonl',
24
+ rewards: 'rewards.jsonl',
25
+ };
26
+
27
+ async function ensureDir() {
28
+ await fs.mkdir(CACHE_DIR, { recursive: true });
29
+ }
30
+
31
+ async function appendJsonl(fileName, record) {
32
+ await ensureDir();
33
+ const line = JSON.stringify(record) + '\n';
34
+ await fs.appendFile(path.join(CACHE_DIR, fileName), line, 'utf8');
35
+ }
36
+
37
+ async function readJsonl(fileName, predicate) {
38
+ const filePath = path.join(CACHE_DIR, fileName);
39
+ try {
40
+ const txt = await fs.readFile(filePath, 'utf8');
41
+ const lines = txt
42
+ .split('\n')
43
+ .map((l) => l.trim())
44
+ .filter(Boolean);
45
+ const parsed = lines.map((l) => {
46
+ try {
47
+ return JSON.parse(l);
48
+ } catch {
49
+ return null;
50
+ }
51
+ }).filter(Boolean);
52
+ return predicate ? parsed.filter(predicate) : parsed;
53
+ } catch (e) {
54
+ if (e.code === 'ENOENT') return [];
55
+ throw e;
56
+ }
57
+ }
58
+
59
+ // ---------------------------
60
+ // Question cache
61
+ // ---------------------------
62
+ export async function getCachedQuestions(chunkId) {
63
+ return readJsonl(
64
+ FILES.questions,
65
+ (r) => r.chunk_id === chunkId && Array.isArray(r.questions) && r.questions.length > 0,
66
+ );
67
+ }
68
+
69
+ export async function saveQuestions(chunkId, questions, meta = {}) {
70
+ const ts = Date.now();
71
+ const record = {
72
+ chunk_id: chunkId,
73
+ questions,
74
+ question_ids: questions.map((q) => questionId(chunkId, q)),
75
+ provider: meta.provider,
76
+ model: meta.model,
77
+ ts,
78
+ };
79
+ await appendJsonl(FILES.questions, record);
80
+ }
81
+
82
+ // ---------------------------
83
+ // Generator cache
84
+ // ---------------------------
85
+ export async function getCachedGeneration(chunkId, qId) {
86
+ const records = await readJsonl(
87
+ FILES.generations,
88
+ (r) => r.chunk_id === chunkId && r.question_id === qId && r.answer,
89
+ );
90
+ // return the latest match if multiple
91
+ return records.length ? records[records.length - 1] : null;
92
+ }
93
+
94
+ export async function saveGeneration(chunkId, qId, gen, meta = {}) {
95
+ if (!gen) return;
96
+ const gen_id = generationId(chunkId, qId, gen.answer || gen.raw || '');
97
+ const ts = Date.now();
98
+ const record = {
99
+ chunk_id: chunkId,
100
+ question_id: qId,
101
+ gen_id,
102
+ answer: gen.answer || gen.raw || '',
103
+ thought: gen.thought,
104
+ raw: gen.raw,
105
+ provider: meta.provider,
106
+ model: meta.model,
107
+ ts,
108
+ };
109
+ await appendJsonl(FILES.generations, record);
110
+ return record;
111
+ }
112
+
113
+ // ---------------------------
114
+ // Verification cache
115
+ // ---------------------------
116
+ export async function getCachedVerification(chunkId, qId, genId) {
117
+ const records = await readJsonl(
118
+ FILES.verifications,
119
+ (r) =>
120
+ r.chunk_id === chunkId &&
121
+ r.question_id === qId &&
122
+ (!genId || r.gen_id === genId),
123
+ );
124
+ return records.length ? records[records.length - 1] : null;
125
+ }
126
+
127
+ export async function saveVerification(chunkId, qId, genId, ver, meta = {}) {
128
+ if (!ver) return;
129
+ const ts = Date.now();
130
+ const record = {
131
+ chunk_id: chunkId,
132
+ question_id: qId,
133
+ gen_id: genId,
134
+ ok: ver.ok === true,
135
+ raw: ver.raw,
136
+ provider: meta.provider,
137
+ model: meta.model,
138
+ ts,
139
+ };
140
+ await appendJsonl(FILES.verifications, record);
141
+ return record;
142
+ }
143
+
144
+ // ---------------------------
145
+ // Reward cache
146
+ // ---------------------------
147
+ export async function getCachedReward(chunkId, qId, genId) {
148
+ const records = await readJsonl(
149
+ FILES.rewards,
150
+ (r) =>
151
+ r.chunk_id === chunkId &&
152
+ r.question_id === qId &&
153
+ (!genId || r.gen_id === genId),
154
+ );
155
+ return records.length ? records[records.length - 1] : null;
156
+ }
157
+
158
+ export async function saveReward(chunkId, qId, genId, rew, meta = {}) {
159
+ if (!rew) return;
160
+ const ts = Date.now();
161
+ const record = {
162
+ chunk_id: chunkId,
163
+ question_id: qId,
164
+ gen_id: genId,
165
+ score: rew.score,
166
+ ok: rew.ok === true,
167
+ raw: rew.raw,
168
+ provider: meta.provider,
169
+ model: meta.model,
170
+ ts,
171
+ };
172
+ await appendJsonl(FILES.rewards, record);
173
+ return record;
174
+ }
175
+
176
+ // Utility export for hashing reuse
177
+ export { chunkIdFromContent, questionId, generationId, normalizeText };
src/pipeline/ids.mjs ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // src/pipeline/ids.mjs
2
+ // Helpers for deterministic IDs across pipeline stages.
3
+ import crypto from 'crypto';
4
+
5
+ export function normalizeText(text = '') {
6
+ return String(text).replace(/\s+/g, ' ').trim();
7
+ }
8
+
9
+ export function hashString(str) {
10
+ return crypto.createHash('sha256').update(str).digest('hex');
11
+ }
12
+
13
+ export function chunkIdFromContent(content, sourceId) {
14
+ const base = normalizeText(content);
15
+ return hashString(`${base}|${sourceId ?? ''}`);
16
+ }
17
+
18
+ export function questionId(chunkId, question) {
19
+ return hashString(`${chunkId}|${normalizeText(question)}`);
20
+ }
21
+
22
+ export function generationId(chunkId, questionId, answer) {
23
+ return hashString(`${chunkId}|${questionId}|${normalizeText(answer)}`);
24
+ }
src/pipeline/step.mjs CHANGED
@@ -48,6 +48,7 @@ export async function runPipelineStep({
48
  generatorProvider,
49
  verifierProvider,
50
  rewardProvider,
 
51
  verbose = false,
52
  logger = console,
53
  } = {}) {
@@ -147,23 +148,28 @@ export async function runPipelineStep({
147
  // Generator
148
  // ----------------------------------------
149
  let gen;
150
- try {
151
- if (verbose) log(' [generator] calling model…');
152
- gen = await runGenerator(question, context, genProv);
 
 
 
 
153
 
154
- if (verbose) {
155
- log(' [generator] answer:');
156
- log(' ' + preview(gen?.answer ?? '', 400).replace(/\n/g, '\n '));
 
 
 
 
 
 
 
 
 
 
157
  }
158
- } catch (e) {
159
- const msg = e?.message || String(e);
160
- if (verbose) errLog(' [generator] ERROR:', msg);
161
- return {
162
- status: 'generator_failed',
163
- question,
164
- context,
165
- error: msg,
166
- };
167
  }
168
 
169
  // Empty answer means generator failed
 
48
  generatorProvider,
49
  verifierProvider,
50
  rewardProvider,
51
+ cachedGen,
52
  verbose = false,
53
  logger = console,
54
  } = {}) {
 
148
  // Generator
149
  // ----------------------------------------
150
  let gen;
151
+ if (cachedGen) {
152
+ gen = cachedGen;
153
+ if (verbose) log(' [generator] using cached generation');
154
+ } else {
155
+ try {
156
+ if (verbose) log(' [generator] calling model…');
157
+ gen = await runGenerator(question, context, genProv);
158
 
159
+ if (verbose) {
160
+ log(' [generator] answer:');
161
+ log(' ' + preview(gen?.answer ?? '', 400).replace(/\n/g, '\n '));
162
+ }
163
+ } catch (e) {
164
+ const msg = e?.message || String(e);
165
+ if (verbose) errLog(' [generator] ERROR:', msg);
166
+ return {
167
+ status: 'generator_failed',
168
+ question,
169
+ context,
170
+ error: msg,
171
+ };
172
  }
 
 
 
 
 
 
 
 
 
173
  }
174
 
175
  // Empty answer means generator failed
src/retrieval/jsonl_chunks.mjs CHANGED
@@ -3,6 +3,7 @@ import fs from 'fs/promises';
3
  import path from 'path';
4
  import crypto from 'crypto';
5
  import { PROJECT_ROOT } from '../pipeline/util.mjs';
 
6
 
7
  const DEFAULT_RAG_PATH = path.join(
8
  PROJECT_ROOT,
@@ -52,14 +53,17 @@ async function loadAllChunksFromJsonl(filePath = DEFAULT_RAG_PATH) {
52
  obj.body ||
53
  '';
54
 
55
- const id =
56
  obj.id ||
57
  obj.session_key ||
58
  obj.title ||
59
  `jsonl-${idx}`;
60
 
 
 
61
  return {
62
  id,
 
63
  content,
64
  source: obj,
65
  };
 
3
  import path from 'path';
4
  import crypto from 'crypto';
5
  import { PROJECT_ROOT } from '../pipeline/util.mjs';
6
+ import { chunkIdFromContent } from '../pipeline/cache.mjs';
7
 
8
  const DEFAULT_RAG_PATH = path.join(
9
  PROJECT_ROOT,
 
53
  obj.body ||
54
  '';
55
 
56
+ const sourceId =
57
  obj.id ||
58
  obj.session_key ||
59
  obj.title ||
60
  `jsonl-${idx}`;
61
 
62
+ const id = chunkIdFromContent(content, sourceId);
63
+
64
  return {
65
  id,
66
+ sourceId,
67
  content,
68
  source: obj,
69
  };
src/retrieval/retrieval.mjs CHANGED
@@ -2,6 +2,7 @@
2
  import dotenv from 'dotenv';
3
  import { Client } from '@elastic/elasticsearch';
4
  import fetch from 'node-fetch';
 
5
 
6
  dotenv.config();
7
 
@@ -160,17 +161,24 @@ export async function fetchChunksFromIndex(limit) {
160
 
161
  const hits = res.hits?.hits || [];
162
 
163
- return hits.map((h) => ({
164
- id: h._id,
165
- score: h._score,
166
- // distill-rag usually stores text in `content`, but we try a few keys
167
- content:
168
  h._source?.content ??
169
  h._source?.text ??
170
  h._source?.chunk ??
171
- '',
172
- source: h._source,
173
- }));
 
 
 
 
 
 
 
 
 
 
174
  }
175
 
176
 
 
2
  import dotenv from 'dotenv';
3
  import { Client } from '@elastic/elasticsearch';
4
  import fetch from 'node-fetch';
5
+ import { chunkIdFromContent } from '../pipeline/cache.mjs';
6
 
7
  dotenv.config();
8
 
 
161
 
162
  const hits = res.hits?.hits || [];
163
 
164
+ return hits.map((h) => {
165
+ const content =
 
 
 
166
  h._source?.content ??
167
  h._source?.text ??
168
  h._source?.chunk ??
169
+ '';
170
+
171
+ const sourceId = h._id;
172
+ const id = chunkIdFromContent(content, sourceId);
173
+
174
+ return {
175
+ id,
176
+ sourceId,
177
+ score: h._score,
178
+ content,
179
+ source: h._source,
180
+ };
181
+ });
182
  }
183
 
184
 
tests/cache_pipeline.test.mjs ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
2
+ import path from 'path';
3
+ import os from 'os';
4
+
5
+ describe('pipeline cache + deterministic IDs', () => {
6
+ const tmpCache = () => path.join(os.tmpdir(), `cache-${Date.now()}`);
7
+
8
+ beforeEach(() => {
9
+ vi.resetModules();
10
+ process.env.PIPELINE_CHUNK_SOURCE = 'jsonl';
11
+ process.env.PIPELINE_CACHE_DIR = tmpCache();
12
+ });
13
+
14
+ afterEach(() => {
15
+ delete process.env.PIPELINE_CHUNK_SOURCE;
16
+ delete process.env.PIPELINE_CACHE_DIR;
17
+ vi.restoreAllMocks();
18
+ vi.unmock('../src/retrieval/jsonl_chunks.mjs');
19
+ vi.unmock('../src/pipeline/step.mjs');
20
+ });
21
+
22
+ it('uses cached reward to skip generator/verifier/reward work', async () => {
23
+ const chunk = { content: 'hello world', sourceId: 's1' };
24
+
25
+ // Deterministic IDs and cache helpers
26
+ const {
27
+ chunkIdFromContent,
28
+ questionId,
29
+ saveQuestions,
30
+ saveReward,
31
+ } = await import('../src/pipeline/cache.mjs');
32
+
33
+ const chunkId = chunkIdFromContent(chunk.content, chunk.sourceId);
34
+ const question = 'What is said?';
35
+ const qId = questionId(chunkId, question);
36
+
37
+ // Pre-populate cache with questions and an accepted reward
38
+ await saveQuestions(chunkId, [question], {
39
+ provider: 'mock',
40
+ model: 'mock',
41
+ });
42
+ await saveReward(
43
+ chunkId,
44
+ qId,
45
+ 'gen-1',
46
+ { score: 0.9, ok: true, raw: '0.9' },
47
+ { provider: 'mock', model: 'mock' },
48
+ );
49
+
50
+ // Mock chunk loader and pipeline step to observe bypass
51
+ vi.doMock('../src/retrieval/jsonl_chunks.mjs', () => ({
52
+ loadRagChunks: vi.fn(async () => [{ ...chunk, id: chunkId }]),
53
+ }));
54
+
55
+ const runPipelineStep = vi.fn();
56
+ vi.doMock('../src/pipeline/step.mjs', () => ({ runPipelineStep }));
57
+
58
+ const { runPipelineBatch } = await import('../src/pipeline/batch.mjs');
59
+
60
+ const result = await runPipelineBatch({
61
+ seedMode: 'question-first',
62
+ limit: 5,
63
+ verbose: false,
64
+ logger: { log() {}, error() {} },
65
+ });
66
+
67
+ expect(runPipelineStep).not.toHaveBeenCalled();
68
+ expect(result.processed).toBe(1);
69
+ expect(result.accepted).toBe(1);
70
+ expect(result.statusCounts.cached_reward).toBe(1);
71
+ });
72
+
73
+ it('stable chunk ids despite whitespace differences', async () => {
74
+ const { chunkIdFromContent } = await import('../src/pipeline/ids.mjs');
75
+
76
+ const idA = chunkIdFromContent('Hello world', 'doc1');
77
+ const idB = chunkIdFromContent('Hello world', 'doc1');
78
+ const idC = chunkIdFromContent('Hello world!', 'doc1');
79
+
80
+ expect(idA).toBe(idB);
81
+ expect(idA).not.toBe(idC); // punctuation changes content hash
82
+ });
83
+ });
tests/pipeline.full.mock.test.mjs CHANGED
@@ -14,6 +14,10 @@ describe('full pipeline (mock providers)', () => {
14
  process.env.VERIFIER_PROVIDER = 'mock';
15
  process.env.REWARD_PROVIDER = 'mock';
16
  process.env.QUESTION_PROVIDER = 'mock';
 
 
 
 
17
  });
18
 
19
  afterEach(() => {
@@ -22,6 +26,7 @@ describe('full pipeline (mock providers)', () => {
22
  delete process.env.VERIFIER_PROVIDER;
23
  delete process.env.REWARD_PROVIDER;
24
  delete process.env.QUESTION_PROVIDER;
 
25
  vi.restoreAllMocks();
26
  vi.unmock('../src/retrieval/jsonl_chunks.mjs');
27
  });
 
14
  process.env.VERIFIER_PROVIDER = 'mock';
15
  process.env.REWARD_PROVIDER = 'mock';
16
  process.env.QUESTION_PROVIDER = 'mock';
17
+ process.env.PIPELINE_CACHE_DIR = path.join(
18
+ os.tmpdir(),
19
+ `cache-${Date.now()}`,
20
+ );
21
  });
22
 
23
  afterEach(() => {
 
26
  delete process.env.VERIFIER_PROVIDER;
27
  delete process.env.REWARD_PROVIDER;
28
  delete process.env.QUESTION_PROVIDER;
29
+ delete process.env.PIPELINE_CACHE_DIR;
30
  vi.restoreAllMocks();
31
  vi.unmock('../src/retrieval/jsonl_chunks.mjs');
32
  });
tests/pipeline_behaviour.test.mjs CHANGED
@@ -74,10 +74,15 @@ describe('runPipelineBatch question cap', () => {
74
  beforeEach(() => {
75
  vi.resetModules();
76
  process.env.PIPELINE_CHUNK_SOURCE = 'jsonl';
 
 
 
 
77
  });
78
 
79
  afterEach(() => {
80
  delete process.env.PIPELINE_CHUNK_SOURCE;
 
81
  vi.restoreAllMocks();
82
  vi.unmock('../src/providers/provider.mjs');
83
  vi.unmock('../src/retrieval/jsonl_chunks.mjs');
 
74
  beforeEach(() => {
75
  vi.resetModules();
76
  process.env.PIPELINE_CHUNK_SOURCE = 'jsonl';
77
+ process.env.PIPELINE_CACHE_DIR = path.join(
78
+ os.tmpdir(),
79
+ `cache-${Date.now()}`,
80
+ );
81
  });
82
 
83
  afterEach(() => {
84
  delete process.env.PIPELINE_CHUNK_SOURCE;
85
+ delete process.env.PIPELINE_CACHE_DIR;
86
  vi.restoreAllMocks();
87
  vi.unmock('../src/providers/provider.mjs');
88
  vi.unmock('../src/retrieval/jsonl_chunks.mjs');