htaf committed on
Commit
fad3187
·
1 Parent(s): 35dc819

updated a bunch of stuff

Browse files
AGENTS.md CHANGED
@@ -6,14 +6,17 @@
6
  - Tests sit in `tests/` (Vitest), with sample seeds in `test_samples/`; pipeline outputs write to `gold/`.
7
  - Config baselines (models, limits) are in `configs/pipeline.json`; run scripts live at the repo root (`run.sh`, `try_prompt.sh`).
8
  - Cached intermediates (questions/gens/verifications/rewards) live in `data/cache/*.jsonl`; set `PIPELINE_CACHE_DIR` to redirect.
 
9
 
10
  ## Build, Test, and Development Commands
11
  - `npm install` – install dependencies.
12
  - `npm run pipeline -- --limit 20 --verbose` – run the default pipeline using static seeds.
13
  - `PIPELINE_SEED_MODE=question-first npm run pipeline -- --limit 20 --verbose` – enable question-first seeding.
 
14
  - `npm test` – run all unit tests (mocked by default).
15
  - `REAL_ES=1 npm test` – exercise retrieval against a live Elasticsearch + embedding endpoint.
16
  - Red/green pathway: use `*_PROVIDER=mock` plus JSONL chunk source to dry-run (green) without models; switch to real providers for red runs and the cache will skip already-completed stages.
 
17
 
18
  ## Coding Style & Naming Conventions
19
  - ECMAScript modules (`type: "module"`); prefer `.mjs` for shared code.
 
6
  - Tests sit in `tests/` (Vitest), with sample seeds in `test_samples/`; pipeline outputs write to `gold/`.
7
  - Config baselines (models, limits) are in `configs/pipeline.json`; run scripts live at the repo root (`run.sh`, `try_prompt.sh`).
8
  - Cached intermediates (questions/gens/verifications/rewards) live in `data/cache/*.jsonl`; set `PIPELINE_CACHE_DIR` to redirect.
9
+ - Random walk over chunks: set `PIPELINE_RANDOM_WALK=1` (or `PIPELINE_CHUNK_ORDER=random`) to shuffle chunk order using crypto randomness.
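A minimal sketch of the crypto-backed shuffle this flag enables (a Fisher–Yates walk over the chunk array; the helper name `shuffleInPlace` is illustrative, not an exported API of this repo):

```javascript
// Fisher–Yates shuffle using crypto.randomInt for unbiased ordering.
import crypto from 'node:crypto';

function shuffleInPlace(items) {
  for (let i = items.length - 1; i > 0; i--) {
    const j = crypto.randomInt(i + 1); // uniform integer in [0, i]
    [items[i], items[j]] = [items[j], items[i]];
  }
  return items;
}
```

The shuffle mutates the array but preserves its contents, so downstream caps (`--limit`, `--chunk-limit`) still apply unchanged.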
10
 
11
  ## Build, Test, and Development Commands
12
  - `npm install` – install dependencies.
13
  - `npm run pipeline -- --limit 20 --verbose` – run the default pipeline using static seeds.
14
  - `PIPELINE_SEED_MODE=question-first npm run pipeline -- --limit 20 --verbose` – enable question-first seeding.
15
+ - Random-walk mode: `PIPELINE_RANDOM_WALK=1 QUESTION_MAX_PER_CHUNK=3 npm run pipeline -- --limit 3 --chunk-limit 10` shuffles chunks, caps questions per chunk at 3, processes at most 3 questions overall, and samples up to 10 chunks.
16
  - `npm test` – run all unit tests (mocked by default).
17
  - `REAL_ES=1 npm test` – exercise retrieval against a live Elasticsearch + embedding endpoint.
18
  - Red/green pathway: use `*_PROVIDER=mock` plus JSONL chunk source to dry-run (green) without models; switch to real providers for red runs and the cache will skip already-completed stages.
19
+ - Verifier contract: models return JSON `{"REASONING": [...], "SCORE": <number|"PASS"|"FAIL">}`; SCORE >= 0.5 or PASS → accepted. The prompt must remain unchanged; parsing is tolerant of the PASS/FAIL token format.
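A hedged sketch of the tolerant SCORE handling described by the contract above (the helper `scoreAccepted` is illustrative; the pipeline's actual parser may differ in detail):

```javascript
// Accept a verifier SCORE that may be a number, a numeric string, or a PASS/FAIL token.
function scoreAccepted(score) {
  if (typeof score === 'number') return score >= 0.5;
  if (typeof score === 'string') {
    const t = score.trim().toLowerCase();
    if (t === 'pass') return true;
    if (t === 'fail') return false;
    const n = Number(t);
    if (Number.isFinite(n)) return n >= 0.5;
  }
  return false; // missing or unparseable scores are rejected
}
```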
20
 
21
  ## Coding Style & Naming Conventions
22
  - ECMAScript modules (`type: "module"`); prefer `.mjs` for shared code.
README.md CHANGED
@@ -82,6 +82,7 @@ All pure modules include Vitest coverage:
82
  * question generation
83
  * provider router
84
  * pipeline integration (mock)
 
85
 
86
  ---
87
 
 
82
  * question generation
83
  * provider router
84
  * pipeline integration (mock)
85
+ * JSONL cache, PASS/FAIL verifier parsing
86
 
87
  ---
88
 
prompts/generator_prompt.txt CHANGED
@@ -1,61 +1,51 @@
1
- # SYSTEM ROLE
2
- You are a knowledge distillation generator optimized for training reasoning LoRAs. Your outputs must demonstrate *pedagogical reasoning fidelity* - showing not just answers, but the exact cognitive process a student model should learn. Every output will be used as training data.
3
-
4
- ## CORE DIRECTIVES (NON-NEGOTIABLE)
5
- 1. **CONTEXT FIDELITY**: Use ONLY provided context. No external knowledge. Ever.
6
- 2. **REASONING GRANULARITY**: Decompose reasoning into atomic, teachable steps
7
- 3. **UNCERTAINTY CALIBRATION**: Quantify confidence at each reasoning stage
8
- 4. **BIAS MITIGATION**: Explicitly flag context limitations and reasoning risks
9
- 5. **DISTILLATION OPTIMIZATION**: Structure outputs for maximum LoRA weight efficiency
10
-
11
- ## REASONING PROTOCOL (EXECUTE IN ORDER INSIDE XML TAGS)
12
- <understanding>
13
- - Restate question in atomic components
14
- - Identify: [Simple/Factual] vs [Multi-hop/Inferential] vs [Ambiguous]
15
- - Flag required context elements (quote paragraph numbers)
16
- </understanding>
17
-
18
- <context_verification>
19
- - For EACH required fact:
20
- ▸ Cite exact context location (para #[X])
21
- ▸ Assess source quality: [Primary/Secondary/Contradictory/Uncertain]
22
- ▸ If missing/insufficient: TERMINATE with "I cannot answer..."
23
- </context_verification>
24
-
25
- <reasoning_chain confidence_baseline="90%">
26
- [STRUCTURED STEP FORMAT PER STEP]
27
- Step #[N]:
28
- - Operation: [Retrieval/Comparison/Causality/Quantification/Contradiction-Check]
29
- - Context evidence: "Short quote" (para #[X])
30
- - Confidence delta: [+0%/-5% etc.] due to [reason]
31
- - Inference rule used: [e.g., "Temporal transitivity", "Numerical constraint propagation"]
32
- - Bias check: [None/Selection bias/Uncertainty propagation risk]
33
- </reasoning_chain>
34
-
35
- <synthesis>
36
- - Resolve conflicts between steps
37
- - Calculate cumulative confidence: (baseline * step confidences)
38
- Final confidence threshold: <80% → "I cannot answer..."
39
- - Verify against reasoning_chain constraints
40
- </synthesis>
41
-
42
- ## OUTPUT SPECIFICATION (MACHINE-PARSIABLE)
43
- After </synthesis>:
44
- Confidence: [INTEGER 0-100]
45
- Answer: [CONCISE RESPONSE OR EXACT FALLBACK PHRASE]
46
- Evidence: [MAX 3 SHORT PHRASES] | [PARA #S]
47
- Uncertainty_flags: [NONE/CONTEXT_GAPS/CONTRADICTIONS/BIAS_RISK]
48
-
49
- ## STRICT FORMATTING RULES
50
- - XML tags MUST close properly
51
- Evidence phrases: ≤7 words each
52
- - Confidence calculations must show work in <synthesis>
53
- - If context_verification fails: OUTPUT ONLY "I cannot answer this from the provided context." (NO tags)
54
- - NEVER use markdown, asterisks, or special formatting
55
 
56
  ---
57
  CONTEXT:
58
- {{CONTEXT}}
 
59
 
60
  QUESTION:
61
  {{QUESTION}}
 
1
+ <|system|>
2
+ You are a knowledge distillation generator for training compact reasoning models. Your outputs MUST demonstrate pedagogical reasoning fidelity using Qwen3's native thinking protocol. Every output becomes gold training data.
3
+
4
+ <|rules|>
5
+ NON-NEGOTIABLE:
6
+ 1. CONTEXT FIDELITY: Use ONLY visible elements in the CONTEXT (text, objects, relationships). No external knowledge.
7
+ 2. VL-AWARE GROUNDING: Reference visual/text elements by position ("top-left graphic", "second paragraph") – NOT paragraph numbers.
8
+ 3. CONFIDENCE SIMPLIFICATION: Use ONLY [High/Medium/Low] confidence levels – NO percentages or deltas.
9
+ 4. FAILURE MODE: If context lacks critical evidence → output EXACTLY: "I cannot answer this from the provided context."
10
+ 5. BIAS FLAGGING: Add "LIMITATION:" note ONLY for severe gaps/contradictions.
11
+
12
+ <|reasoning_protocol|>
13
+ <|thought|>
14
+ 1. UNDERSTAND QUESTION:
15
+ - Break into atomic sub-questions
16
+ - Classify: [Factual] / [Visual-Text Fusion] / [Multi-Step Inference] / [Ambiguous]
17
+
18
+ 2. CONTEXT VERIFICATION:
19
+ - For EACH required element:
20
+ • Describe location: "Table 3 in bottom-right", "Caption under Figure 2"
21
+ • Quote EXACT visible text snippet (≤10 words)
22
+ • Mark quality: [Clear] / [Blurry/Partial] / [Contradictory]
23
+
24
+ 3. STEPWISE REASONING:
25
+ Step 1: [Action type: e.g., "Extract value from chart"]
26
+ Evidence: "Exact phrase" (location description)
27
+ Confidence: High/Medium/Low + 3-word reason ("low-contrast text")
28
+
29
+ Step 2: [Next action]... (repeat as needed)
30
+
31
+ 4. SYNTHESIS:
32
+ - Resolve conflicts between steps
33
+ - Final confidence: [High/Medium/Low]
34
+ - LIMITATION: [None / Missing visual element / Text ambiguity]
35
+ <|end_of_thought|>
36
+
37
+ <|output_format|>
38
+ <|answer|>
39
+ Confidence: [High/Medium/Low]
40
+ Answer: [Concise response OR exact failure phrase]
41
+ Evidence: ["Phrase 1" (location), "Phrase 2" (location)]
42
+ Limitations: [None / ...]
43
+ <|end_of_answer|>
44
 
45
  ---
46
  CONTEXT:
47
+ {{CONTEXT}}
48
+ *(Note: For VL models, this contains BOTH text + visual scene description)*
49
 
50
  QUESTION:
51
  {{QUESTION}}
scripts/cache_report.mjs ADDED
@@ -0,0 +1,104 @@
1
+ #!/usr/bin/env node
2
+ // scripts/cache_report.mjs
3
+ // Summarize cache status (questions/generations/verifications/rewards).
4
+
5
+ import fs from 'fs/promises';
6
+ import path from 'path';
7
+ import { fileURLToPath } from 'url';
8
+
9
+ const __filename = fileURLToPath(import.meta.url);
10
+ const __dirname = path.dirname(__filename);
11
+ const PROJECT_ROOT = path.join(__dirname, '..');
12
+
13
+ const CACHE_DIR = (() => {
14
+ const custom = process.env.PIPELINE_CACHE_DIR;
15
+ if (custom) {
16
+ return path.isAbsolute(custom)
17
+ ? custom
18
+ : path.join(PROJECT_ROOT, custom);
19
+ }
20
+ return path.join(PROJECT_ROOT, 'data', 'cache');
21
+ })();
22
+
23
+ const FILES = {
24
+ questions: 'questions.jsonl',
25
+ generations: 'generations.jsonl',
26
+ verifications: 'verifications.jsonl',
27
+ rewards: 'rewards.jsonl',
28
+ };
29
+
30
+ async function readJsonl(fileName) {
31
+ const filePath = path.join(CACHE_DIR, fileName);
32
+ try {
33
+ const txt = await fs.readFile(filePath, 'utf8');
34
+ return txt
35
+ .split('\n')
36
+ .map((l) => l.trim())
37
+ .filter(Boolean)
38
+ .map((line) => {
39
+ try {
40
+ return JSON.parse(line);
41
+ } catch {
42
+ return null;
43
+ }
44
+ })
45
+ .filter(Boolean);
46
+ } catch (e) {
47
+ if (e.code === 'ENOENT') return [];
48
+ throw e;
49
+ }
50
+ }
51
+
52
+ function uniq(arr) {
53
+ return [...new Set(arr)];
54
+ }
55
+
56
+ async function main() {
57
+ const questions = await readJsonl(FILES.questions);
58
+ const generations = await readJsonl(FILES.generations);
59
+ const verifications = await readJsonl(FILES.verifications);
60
+ const rewards = await readJsonl(FILES.rewards);
61
+
62
+ const chunkIds = uniq([
63
+ ...questions.map((r) => r.chunk_id),
64
+ ...generations.map((r) => r.chunk_id),
65
+ ...verifications.map((r) => r.chunk_id),
66
+ ...rewards.map((r) => r.chunk_id),
67
+ ].filter(Boolean));
68
+
69
+ const totalQuestions = questions.reduce((acc, r) => {
70
+ if (Array.isArray(r.questions)) return acc + r.questions.length;
71
+ if (Array.isArray(r.question_ids)) return acc + r.question_ids.length;
72
+ return acc + 1;
73
+ }, 0);
74
+
75
+ const totalGenerations = generations.length;
76
+ const totalVerifications = verifications.length;
77
+ const totalRewards = rewards.length;
78
+
79
+ const passedVerifications = verifications.filter((v) => v.ok === true).length;
80
+ const passedRewards = rewards.filter((r) => r.ok === true).length;
81
+
82
+ const rows = [
83
+ ['Cache dir', CACHE_DIR],
84
+ ['Unique chunks', chunkIds.length],
85
+ ['Question records', questions.length],
86
+ ['Questions total', totalQuestions],
87
+ ['Generation records', totalGenerations],
88
+ ['Verification records', totalVerifications],
89
+ ['Verifications ok', passedVerifications],
90
+ ['Reward records', totalRewards],
91
+ ['Rewards ok', passedRewards],
92
+ ];
93
+
94
+ const colWidth = Math.max(...rows.map(([k]) => k.length)) + 2;
95
+ for (const [key, val] of rows) {
96
+ const pad = ' '.repeat(colWidth - key.length);
97
+ console.log(`${key}:${pad}${val}`);
98
+ }
99
+ }
100
+
101
+ main().catch((err) => {
102
+ console.error('Cache report error:', err);
103
+ process.exit(1);
104
+ });
scripts/gold_preview.mjs ADDED
@@ -0,0 +1,193 @@
1
+ #!/usr/bin/env node
2
+ // scripts/gold_preview.mjs
3
+ // Quick preview of gold JSONL entries (questions and answers).
4
+
5
+ import fs from 'fs/promises';
6
+ import path from 'path';
7
+ import { fileURLToPath } from 'url';
8
+
9
+ const __filename = fileURLToPath(import.meta.url);
10
+ const __dirname = path.dirname(__filename);
11
+ const PROJECT_ROOT = path.join(__dirname, '..');
12
+
13
+ function parseArgs() {
14
+ const args = process.argv.slice(2);
15
+ let limit = 5;
16
+ let fileArg;
17
+ let full = false;
18
+ let maxQuestion = 500;
19
+ let maxAnswer = 800;
20
+ let maxContext = 300;
21
+
22
+ for (let i = 0; i < args.length; i++) {
23
+ const a = args[i];
24
+ if (a === '--limit' || a === '-n') {
25
+ const v = Number(args[i + 1]);
26
+ if (Number.isFinite(v)) limit = v;
27
+ i++;
28
+ } else if (a === '--file' || a === '-f') {
29
+ fileArg = args[i + 1];
30
+ i++;
31
+ } else if (a === '--full') {
32
+ full = true;
33
+ } else if (a === '--max-question') {
34
+ const v = Number(args[i + 1]);
35
+ if (Number.isFinite(v)) maxQuestion = v;
36
+ i++;
37
+ } else if (a === '--max-answer') {
38
+ const v = Number(args[i + 1]);
39
+ if (Number.isFinite(v)) maxAnswer = v;
40
+ i++;
41
+ } else if (a === '--max-context') {
42
+ const v = Number(args[i + 1]);
43
+ if (Number.isFinite(v)) maxContext = v;
44
+ i++;
45
+ }
46
+ }
47
+
48
+ const goldPath =
49
+ fileArg ||
50
+ process.env.GOLD_PATH ||
51
+ path.join(PROJECT_ROOT, 'gold', 'pipeline_gold.jsonl');
52
+
53
+ if (full) {
54
+ maxQuestion = Infinity;
55
+ maxAnswer = Infinity;
56
+ maxContext = Infinity;
57
+ }
58
+
59
+ return { limit, goldPath, full, maxQuestion, maxAnswer, maxContext };
60
+ }
61
+
62
+ function preview(text, max = 200, full = false) {
63
+ if (full) return Array.isArray(text) ? text.join(' ') : String(text ?? '');
64
+ if (text == null) return '';
65
+ const str = Array.isArray(text) ? text.join(' ') : String(text);
66
+ if (str.length <= max) return str;
67
+ return str.slice(0, max) + `… [+${str.length - max} chars]`;
68
+ }
69
+
70
+ async function main() {
71
+ const {
72
+ limit,
73
+ goldPath,
74
+ full,
75
+ maxQuestion,
76
+ maxAnswer,
77
+ maxContext,
78
+ } = parseArgs();
79
+
80
+ let raw;
81
+ try {
82
+ raw = await fs.readFile(goldPath, 'utf8');
83
+ } catch (e) {
84
+ if (e.code === 'ENOENT') {
85
+ console.error(`Gold file not found: ${goldPath}`);
86
+ process.exit(1);
87
+ }
88
+ throw e;
89
+ }
90
+
91
+ const lines = raw
92
+ .split('\n')
93
+ .map((l) => l.trim())
94
+ .filter(Boolean)
95
+ .slice(0, limit);
96
+
97
+ console.log(`Gold preview (${lines.length} of max ${limit}) from ${goldPath}\n`);
98
+
99
+ lines.forEach((line, idx) => {
100
+ let obj;
101
+ try {
102
+ obj = JSON.parse(line);
103
+ } catch {
104
+ console.log(`#${idx + 1}: [invalid JSON] ${preview(line, 120)}`);
105
+ return;
106
+ }
107
+
108
+ const q = obj.question || '[no question]';
109
+ const ans = obj.sample?.answer || obj.sample?.raw || '[no answer]';
110
+ const chunkId = obj.sourceChunkId || obj.context?.[0]?.id || '[unknown chunk]';
111
+ const ctxSnippet = obj.context?.[0]?.content || obj.sourceChunk || '';
112
+ const rew = obj.reward?.score ?? obj.reward?.ok;
113
+ const verOk = obj.verifier?.ok ?? obj.ver?.ok;
114
+ const verScore = obj.verifier?.score ?? obj.ver?.score;
115
+
116
+ console.log(`#${idx + 1}`);
117
+ console.log(`Chunk: ${chunkId}`);
118
+ console.log(`Q: ${preview(q, maxQuestion, full)}`);
119
+ console.log(`A: ${preview(ans, maxAnswer, full)}`);
120
+ if (ctxSnippet) console.log(`Ctx: ${preview(ctxSnippet, maxContext, full)}`);
121
+ if (verOk !== undefined) console.log(`Verifier ok: ${verOk}${verScore !== undefined ? ` (score: ${verScore})` : ''}`);
122
+ if (rew !== undefined) console.log(`Reward: ${rew}`);
123
+ console.log('');
124
+ });
125
+ }
126
+
127
+ // Exported for tests
128
+ export async function capturePreview() {
129
+ const {
130
+ limit,
131
+ goldPath,
132
+ full,
133
+ maxQuestion,
134
+ maxAnswer,
135
+ maxContext,
136
+ } = parseArgs();
137
+
138
+ let raw;
139
+ try {
140
+ raw = await fs.readFile(goldPath, 'utf8');
141
+ } catch (e) {
142
+ if (e.code === 'ENOENT') {
143
+ throw new Error(`Gold file not found: ${goldPath}`);
144
+ }
145
+ throw e;
146
+ }
147
+
148
+ const lines = raw
149
+ .split('\n')
150
+ .map((l) => l.trim())
151
+ .filter(Boolean)
152
+ .slice(0, limit);
153
+
154
+ const chunks = [];
155
+
156
+ chunks.push(`Gold preview (${lines.length} of max ${limit}) from ${goldPath}\n`);
157
+
158
+ lines.forEach((line, idx) => {
159
+ let obj;
160
+ try {
161
+ obj = JSON.parse(line);
162
+ } catch {
163
+ chunks.push(`#${idx + 1}: [invalid JSON] ${preview(line, 120)}`);
164
+ return;
165
+ }
166
+
167
+ const q = obj.question || '[no question]';
168
+ const ans = obj.sample?.answer || obj.sample?.raw || '[no answer]';
169
+ const chunkId = obj.sourceChunkId || obj.context?.[0]?.id || '[unknown chunk]';
170
+ const ctxSnippet = obj.context?.[0]?.content || obj.sourceChunk || '';
171
+ const rew = obj.reward?.score ?? obj.reward?.ok;
172
+ const verOk = obj.verifier?.ok ?? obj.ver?.ok;
173
+ const verScore = obj.verifier?.score ?? obj.ver?.score;
174
+
175
+ chunks.push(`#${idx + 1}`);
176
+ chunks.push(`Chunk: ${chunkId}`);
177
+ chunks.push(`Q: ${preview(q, maxQuestion, full)}`);
178
+ chunks.push(`A: ${preview(ans, maxAnswer, full)}`);
179
+ if (ctxSnippet) chunks.push(`Ctx: ${preview(ctxSnippet, maxContext, full)}`);
180
+ if (verOk !== undefined) chunks.push(`Verifier ok: ${verOk}${verScore !== undefined ? ` (score: ${verScore})` : ''}`);
181
+ if (rew !== undefined) chunks.push(`Reward: ${rew}`);
182
+ chunks.push('');
183
+ });
184
+
185
+ return chunks.join('\n');
186
+ }
187
+
188
+ if (process.argv[1] && process.argv[1].endsWith('gold_preview.mjs')) {
189
+ main().catch((err) => {
190
+ console.error('Gold preview error:', err);
191
+ process.exit(1);
192
+ });
193
+ }
scripts/purge_mock_gold.mjs ADDED
@@ -0,0 +1,66 @@
1
+ #!/usr/bin/env node
2
+ // scripts/purge_mock_gold.mjs
3
+ // Remove gold entries with mock/test questions (e.g., "Q1?") from pipeline_gold.jsonl
4
+
5
+ import fs from 'fs/promises';
6
+ import path from 'path';
7
+ import { fileURLToPath } from 'url';
8
+
9
+ const __filename = fileURLToPath(import.meta.url);
10
+ const __dirname = path.dirname(__filename);
11
+ const PROJECT_ROOT = path.join(__dirname, '..');
12
+
13
+ const GOLD_PATH =
14
+ process.env.GOLD_PATH ||
15
+ path.join(PROJECT_ROOT, 'gold', 'pipeline_gold.jsonl');
16
+
17
+ const questionRegex = new RegExp(process.env.PURGE_QUESTION_REGEX || '^Q1\\?$');
18
+
19
+ async function main() {
20
+ let raw;
21
+ try {
22
+ raw = await fs.readFile(GOLD_PATH, 'utf8');
23
+ } catch (e) {
24
+ if (e.code === 'ENOENT') {
25
+ console.error(`Gold file not found: ${GOLD_PATH}`);
26
+ process.exit(1);
27
+ }
28
+ throw e;
29
+ }
30
+
31
+ const lines = raw
32
+ .split('\n')
33
+ .map((l) => l.trim())
34
+ .filter(Boolean);
35
+
36
+ const kept = [];
37
+ const removed = [];
38
+
39
+ for (const line of lines) {
40
+ try {
41
+ const obj = JSON.parse(line);
42
+ const q = obj.question || '';
43
+ if (questionRegex.test(q)) {
44
+ removed.push(line);
45
+ } else {
46
+ kept.push(line);
47
+ }
48
+ } catch {
49
+ // if invalid JSON, keep it to avoid accidental loss
50
+ kept.push(line);
51
+ }
52
+ }
53
+
54
+ if (removed.length === 0) {
55
+ console.log('No matching entries to purge.');
56
+ return;
57
+ }
58
+
59
+ await fs.writeFile(GOLD_PATH, kept.join('\n') + '\n', 'utf8');
60
+ console.log(`Purged ${removed.length} entries matching ${questionRegex}. Kept ${kept.length}.`);
61
+ }
62
+
63
+ main().catch((err) => {
64
+ console.error('Purge error:', err);
65
+ process.exit(1);
66
+ });
scripts/regenerate_gold_from_cache.mjs ADDED
@@ -0,0 +1,192 @@
1
+ #!/usr/bin/env node
2
+ // scripts/regenerate_gold_from_cache.mjs
3
+ // Regenerate gold/pipeline_gold.jsonl from cache JSONL files.
4
+
5
+ import fs from 'fs/promises';
6
+ import path from 'path';
7
+ import { fileURLToPath } from 'url';
8
+ import { loadRagChunks } from '../src/retrieval/jsonl_chunks.mjs';
9
+ import {
10
+ questionId,
11
+ chunkIdFromContent,
12
+ } from '../src/pipeline/cache.mjs';
13
+
14
+ const __filename = fileURLToPath(import.meta.url);
15
+ const __dirname = path.dirname(__filename);
16
+ const PROJECT_ROOT = path.join(__dirname, '..');
17
+
18
+ const CACHE_DIR = (() => {
19
+ const custom = process.env.PIPELINE_CACHE_DIR;
20
+ if (custom) {
21
+ return path.isAbsolute(custom)
22
+ ? custom
23
+ : path.join(PROJECT_ROOT, custom);
24
+ }
25
+ return path.join(PROJECT_ROOT, 'data', 'cache');
26
+ })();
27
+
28
+ const GOLD_PATH =
29
+ process.env.GOLD_PATH ||
30
+ path.join(PROJECT_ROOT, 'gold', 'pipeline_gold.jsonl');
31
+
32
+ const CACHE_FILES = {
33
+ questions: 'questions.jsonl',
34
+ generations: 'generations.jsonl',
35
+ verifications: 'verifications.jsonl',
36
+ rewards: 'rewards.jsonl',
37
+ };
38
+
39
+ async function readJsonl(fileName) {
40
+ const filePath = path.join(CACHE_DIR, fileName);
41
+ try {
42
+ const txt = await fs.readFile(filePath, 'utf8');
43
+ return txt
44
+ .split('\n')
45
+ .map((l) => l.trim())
46
+ .filter(Boolean)
47
+ .map((line) => {
48
+ try {
49
+ return JSON.parse(line);
50
+ } catch {
51
+ return null;
52
+ }
53
+ })
54
+ .filter(Boolean);
55
+ } catch (e) {
56
+ if (e.code === 'ENOENT') return [];
57
+ throw e;
58
+ }
59
+ }
60
+
61
+ function compositeKey(...parts) {
62
+ return parts.filter(Boolean).join('|');
63
+ }
64
+
65
+ async function loadChunksMap() {
66
+ const chunks = await loadRagChunks();
67
+ const map = new Map();
68
+ for (const c of chunks) {
69
+ const cid = c.id || chunkIdFromContent(c.content, c.sourceId || c.source?.id);
70
+ map.set(cid, c);
71
+ }
72
+ return map;
73
+ }
74
+
75
+ function latestByTs(records, keyFn) {
76
+ const map = new Map();
77
+ for (const r of records) {
78
+ const key = keyFn(r);
79
+ if (!key) continue;
80
+ const existing = map.get(key);
81
+ if (!existing || (r.ts && (!existing.ts || r.ts > existing.ts))) {
82
+ map.set(key, r);
83
+ }
84
+ }
85
+ return map;
86
+ }
87
+
88
+ function rewardOk(r) {
89
+ if (!r) return false;
90
+ if (r.ok === true) return true;
91
+ if (typeof r.score === 'number') return r.score >= 0.5;
92
+ if (typeof r.score === 'string') {
93
+ const t = r.score.trim().toLowerCase();
94
+ if (t === 'pass') return true;
95
+ const num = Number(r.score);
96
+ if (Number.isFinite(num)) return num >= 0.5;
97
+ }
98
+ return false;
99
+ }
100
+
101
+ async function main() {
102
+ const [questions, generations, verifications, rewards] = await Promise.all([
103
+ readJsonl(CACHE_FILES.questions),
104
+ readJsonl(CACHE_FILES.generations),
105
+ readJsonl(CACHE_FILES.verifications),
106
+ readJsonl(CACHE_FILES.rewards),
107
+ ]);
108
+
109
+ const chunkMap = await loadChunksMap();
110
+
111
+ // Build questionId -> question text map
112
+ const qMap = new Map();
113
+ for (const rec of questions) {
114
+ const chunkId = rec.chunk_id;
115
+ if (!chunkId) continue;
116
+ const qs = Array.isArray(rec.questions)
117
+ ? rec.questions
118
+ : rec.question
119
+ ? [rec.question]
120
+ : [];
121
+ const qIds = Array.isArray(rec.question_ids) ? rec.question_ids : [];
122
+
123
+ for (let i = 0; i < qs.length; i++) {
124
+ const q = qs[i];
125
+ const providedId = qIds[i];
126
+ const hashedId = questionId(chunkId, q);
127
+ if (providedId) {
128
+ qMap.set(compositeKey(chunkId, providedId), q);
129
+ }
130
+ qMap.set(compositeKey(chunkId, hashedId), q);
131
+ }
132
+ }
133
+
134
+ // Latest generation per chunk+question (by ts)
135
+ const genMap = latestByTs(generations, (g) =>
136
+ compositeKey(g.chunk_id, g.question_id),
137
+ );
138
+
139
+ // Latest verification per chunk+question+gen
140
+ const verMap = latestByTs(verifications, (v) =>
141
+ compositeKey(v.chunk_id, v.question_id, v.gen_id),
142
+ );
143
+
144
+ // Latest reward per chunk+question+gen
145
+ const rewMap = latestByTs(rewards, (r) =>
146
+ compositeKey(r.chunk_id, r.question_id, r.gen_id),
147
+ );
148
+
149
+ const out = [];
150
+ let accepted = 0;
151
+ for (const [key, gen] of genMap.entries()) {
152
+ const [chunkId, qId] = key.split('|');
153
+ const question = qMap.get(compositeKey(chunkId, qId)) || '[unknown question]';
154
+ const chunk = chunkMap.get(chunkId) || {};
155
+ const context = [{ id: chunkId, content: chunk.content ?? chunk.text ?? '' }];
156
+ const ver = verMap.get(compositeKey(chunkId, qId, gen.gen_id));
157
+ const rew = rewMap.get(compositeKey(chunkId, qId, gen.gen_id));
158
+
159
+ const rewardIsOk = rewardOk(rew);
160
+ const verifierIsOk = ver?.ok === true;
161
+ if (!rewardIsOk && !verifierIsOk) continue;
162
+ accepted += 1;
163
+
164
+ out.push({
165
+ question,
166
+ sourceChunkId: chunkId,
167
+ sourceChunk: chunk.content ?? chunk.text,
168
+ sourceDoc: chunk.source,
169
+ context,
170
+ sample: gen,
171
+ verifier: ver,
172
+ reward: rew,
173
+ });
174
+ }
175
+
176
+ const lines = out.map((r) => JSON.stringify(r));
177
+ await fs.mkdir(path.dirname(GOLD_PATH), { recursive: true });
178
+ await fs.writeFile(GOLD_PATH, lines.join('\n') + '\n', 'utf8');
179
+
180
+ console.log(`Regenerated gold at ${GOLD_PATH}`);
181
+ console.log(`Accepted records: ${accepted}`);
182
+ console.log(`Total written: ${out.length}`);
183
+ }
184
+
185
+ if (import.meta.url === `file://${__filename}`) {
186
+ main().catch((err) => {
187
+ console.error('Regenerate gold error:', err);
188
+ process.exit(1);
189
+ });
190
+ }
191
+
192
+ export { main };
src/generator/generator_core.mjs CHANGED
@@ -22,22 +22,129 @@ export async function runGenerator(question, contextChunks, provider) {
22
  .replace('{{QUESTION}}', question)
23
  .replace('{{CONTEXT}}', ctxText);
24
 
25
- const raw = await provider.generate(prompt);
26
 
27
- // extract visible chain-of-thought
28
- const thinkMatch = raw.match(/<think>([\s\S]*?)<\/think>/i);
29
- const thought = thinkMatch ? thinkMatch[1].trim() : null;
30
 
31
- // final answer = text after </think>
32
- let answer = raw;
33
- if (thinkMatch) {
34
- answer = raw.slice(thinkMatch.index + thinkMatch[0].length).trim();
35
  }
36
 
37
  return {
38
  raw,
39
  thought,
40
  answer,
 
 
 
41
  question,
42
  context: contextChunks
43
  };
 
22
  .replace('{{QUESTION}}', question)
23
  .replace('{{CONTEXT}}', ctxText);
24
 
25
+ const response = await provider.generate(prompt);
26
 
27
+ // Normalize provider output: string or { response, thinking }
28
+ const raw = typeof response === 'string' ? response : response?.response ?? '';
29
+ const thinkingObj = typeof response === 'object' && response?.thinking ? response.thinking : null;
30
 
31
+ let thought = null;
32
+ let answer = raw?.trim?.() ?? raw;
33
+ let confidence = null;
34
+ let evidence = null;
35
+ let limitations = null;
36
+
37
+ const safeParse = (txt) => {
38
+ if (!txt || typeof txt !== 'string') return null;
39
+ try {
40
+ return JSON.parse(txt);
41
+ } catch {
42
+ // try to extract braces substring
43
+ const start = txt.indexOf('{');
44
+ const end = txt.lastIndexOf('}');
45
+ if (start !== -1 && end !== -1 && end > start) {
46
+ try {
47
+ return JSON.parse(txt.slice(start, end + 1));
48
+ } catch {
49
+ return null;
50
+ }
51
+ }
52
+ return null;
53
+ }
54
+ };
55
+
56
+ // Prefer structured thinking object if provided
57
+ if (thinkingObj) {
58
+ thought = thinkingObj;
59
+ }
60
+
61
+ // Try parsing Qwen-style answer block first
62
+ const parseAnswerBlock = (txt) => {
63
+ if (!txt || typeof txt !== 'string') return null;
64
+ const blockMatch = txt.match(/<\|answer\|>([\s\S]*?)<\|end_of_answer\|>/i);
65
+ const body = blockMatch ? blockMatch[1] : txt;
66
+ const lines = body.split('\n').map((l) => l.trim()).filter(Boolean);
67
+ const result = {};
68
+ for (const line of lines) {
69
+ if (/^confidence:/i.test(line)) {
70
+ const val = line.split(':')[1]?.trim();
71
+ result.confidence = val || null;
72
+ } else if (/^answer:/i.test(line)) {
73
+ result.answer = line.split(':').slice(1).join(':').trim();
74
+ } else if (/^evidence:/i.test(line)) {
75
+ const evLine = line.split(':').slice(1).join(':').trim();
76
+ // Try to parse bracketed array, else split by comma
77
+ let ev = [];
78
+ const arrMatch = evLine.match(/\[(.*)\]/);
79
+ if (arrMatch) {
80
+ ev = arrMatch[1]
81
+ .split(/,(?=(?:[^'"]|'[^']*'|"[^"]*")*$)/)
82
+ .map((s) => s.replace(/^["'\s]+|["'\s]+$/g, ''))
83
+ .filter(Boolean);
84
+ } else {
85
+ ev = evLine.split(',').map((s) => s.trim()).filter(Boolean);
86
+ }
87
+ result.evidence = ev;
88
+ } else if (/^limitations:/i.test(line)) {
89
+ result.limitations = line.split(':').slice(1).join(':').trim();
90
+ }
91
+ }
92
+ return result;
93
+ };
94
+
95
+ const blockParsed = parseAnswerBlock(raw);
96
+ if (blockParsed?.answer) {
97
+ answer = blockParsed.answer;
98
+ confidence = blockParsed.confidence ?? confidence;
99
+ evidence = blockParsed.evidence ?? evidence;
100
+ limitations = blockParsed.limitations ?? limitations;
101
+ } else {
102
+ // fallback: parse JSON if it's actually JSON
103
+ const parsed = safeParse(raw);
104
+ if (parsed && typeof parsed === 'object') {
105
+ const reasoning = parsed.reasoning || parsed.REASONING;
106
+ if (Array.isArray(reasoning) && !thought) {
107
+ thought = reasoning.join(' ');
108
+ }
109
+
110
+ const ans =
111
+ parsed.answer ||
112
+ parsed.ANSWER ||
113
+ parsed.final ||
114
+ parsed.output;
115
+ if (typeof ans === 'string') {
116
+ answer = ans.trim();
117
+ } else if (Array.isArray(ans)) {
118
+ answer = ans.join(' ').trim();
119
+ }
120
+
121
+ if (parsed.confidence != null) {
122
+ const num = Number(parsed.confidence);
123
+ if (Number.isFinite(num)) confidence = num;
124
+ else if (typeof parsed.confidence === 'string') confidence = parsed.confidence;
125
+ }
126
+
127
+ if (parsed.evidence) evidence = parsed.evidence;
128
+ if (parsed.limitations) limitations = parsed.limitations;
129
+ } else {
130
+ // fallback: extract visible chain-of-thought tags if present
131
+ const thinkMatch = typeof raw === 'string'
132
+ ? raw.match(/<think>([\s\S]*?)<\/think>/i)
133
+ : null;
134
+ thought = thought || (thinkMatch ? thinkMatch[1].trim() : null);
135
+ if (thinkMatch) {
136
+ answer = raw.slice(thinkMatch.index + thinkMatch[0].length).trim();
137
+ }
138
+ }
139
  }
140
 
141
  return {
142
  raw,
143
  thought,
144
  answer,
145
+ confidence,
146
+ evidence,
147
+ limitations,
148
  question,
149
  context: contextChunks
150
  };
src/pipeline/batch.mjs CHANGED
@@ -3,6 +3,7 @@ import fs from 'fs/promises';
 import path from 'path';
 
 import { preview } from './util.mjs';
+import crypto from 'crypto';
 import {
   DEFAULT_SEEDS_PATH,
   DEFAULT_OUT_PATH,
@@ -123,7 +124,14 @@ export async function runPipelineBatch({
     const record = {
       question,
       context: result.context,
-      sample: result.gen, // generator output
+      sample: {
+        answer: result.gen?.answer,
+        thought: result.gen?.thought,
+        raw: result.gen?.raw,
+        confidence: result.gen?.confidence,
+        evidence: result.gen?.evidence,
+        limitations: result.gen?.limitations,
+      },
       verifier: result.ver,
       reward: result.rew,
     };
@@ -224,6 +232,23 @@ export async function runPipelineBatch({
   const totalChunks = chunks.length;
   let processedSeeds = 0;
 
+  // Optional random walk over chunks
+  const randomWalk = (() => {
+    const v =
+      process.env.PIPELINE_RANDOM_WALK ||
+      process.env.PIPELINE_CHUNK_ORDER;
+    if (!v) return false;
+    const s = String(v).toLowerCase();
+    return s === '1' || s === 'true' || s === 'yes' || s === 'random';
+  })();
+
+  if (randomWalk && chunks.length > 1) {
+    for (let i = chunks.length - 1; i > 0; i--) {
+      const j = crypto.randomInt(i + 1);
+      [chunks[i], chunks[j]] = [chunks[j], chunks[i]];
+    }
+  }
+
   for (let idx = 0; idx < chunks.length; idx++) {
     if (processed >= questionCap) break;
 
@@ -396,7 +421,11 @@ export async function runPipelineBatch({
       sourceChunk: contextText,
       sourceDoc: chunk.source,
       context: result.context,
-      sample: result.gen,
+      sample: {
+        answer: result.gen?.answer,
+        thought: result.gen?.thought,
+        raw: result.gen?.raw,
+      },
       verifier: result.ver,
       reward: result.rew,
     };
src/reward/reward_core.mjs CHANGED
@@ -22,7 +22,7 @@ export async function runReward({ question, context, gen }, provider) {
 
   const prompt = tmpl
     .replace(/{{QUESTION}}/g, question)
-    .replace(/{{ANSWER}}/g, gen.answer || gen.raw)
+    .replace(/{{ANSWER}}/g, gen.answer || '')
     .replace(/{{CONTEXT}}/g, ctxText);
 
   const raw = await provider.generate(prompt);
state_of_project.md ADDED
@@ -0,0 +1,24 @@
+# State of Project
+
+## What works
+- Question-first pipeline with JSONL chunk source by default; deterministic chunk IDs and JSONL caches for questions, generations, verifications, and rewards.
+- Providers: Ollama/OpenAI/HTTP plus a mock provider; the mock pathway enables full pipeline tests without GPUs or ES.
+- Verifier parsing tolerates the distributor format (`SCORE` as a number or `PASS`/`FAIL` with noisy prefixes); caching and retry logic are in place.
+- Tests: 42 passing (retrieval mock/real, generator, verifier, reward, pipeline behaviour, cache, full mock pipeline).
+- CLI defaults: verbose on, question-first, JSONL chunks; chunk/question limits respected.
+
+## What needs attention
+- The real pipeline currently fails at question generation when the Ollama question model is unreachable; a run requires a live Ollama with the specified model pulled.
+- Reward is still mocked in the most recent run; swap in a real reward provider/model when available.
+- The verifier prompt must stay distributor-provided; parsing is tolerant, but malformed outputs still log raw text in verbose mode.
+- A deprecation warning from `punycode` (Node) shows during tests; benign but noisy.
+
+## Risks
+- Long generator outputs can inflate verifier context; may need truncation or a smaller verifier model to avoid context overruns.
+- Cache growth: JSONL caches can grow large; add rotation/compaction if running many cycles.
+- ES mode defaults to fetching 100 chunks if used; confirm chunk limits when switching from JSONL to ES.
+
+## Next steps (suggested)
+- Pull and start question/answer/verifier/reward models on Ollama (or configure OpenAI/HTTP) and re-run a small batch (`--limit`/`--chunk-limit`) to validate end-to-end with real models.
+- Add an optional verifier retry when JSON parsing fails (1 retry) and cap logged transcripts to reduce noise in verbose runs.
+- Consider a cache inspection/cleanup script for `data/cache/*.jsonl`.
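The cache cleanup suggested in that last bullet could start as a dedupe pass over each JSONL file. A hypothetical sketch — this script does not exist in the repo, and the `(chunk_id, question_id, gen_id)` key merely mirrors the cache record shape seen in this commit's tests:

```javascript
// Hypothetical JSONL cache compactor: keep only the newest record per
// (chunk_id, question_id, gen_id) key, dropping corrupt lines. Sketch only.
function compactJsonl(text) {
  const latest = new Map();
  for (const line of text.split('\n')) {
    if (!line.trim()) continue;
    let rec;
    try { rec = JSON.parse(line); } catch { continue; } // skip corrupt lines
    const key = [rec.chunk_id, rec.question_id, rec.gen_id].join('|');
    const prev = latest.get(key);
    // later timestamp wins; missing ts is treated as oldest
    if (!prev || (rec.ts ?? 0) >= (prev.ts ?? 0)) latest.set(key, rec);
  }
  return [...latest.values()].map((r) => JSON.stringify(r)).join('\n') + '\n';
}
```

A real script would read `data/cache/*.jsonl`, write the compacted text to a temp file, and rename it into place to stay crash-safe.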
tests/generator_core.test.mjs CHANGED
@@ -2,7 +2,7 @@ import { describe, it, expect, vi } from 'vitest';
 import { runGenerator } from '../src/generator/generator_core.mjs';
 
 describe('generator_core.mjs (thinking generator)', () => {
-  it('includes question and context in the prompt', async () => {
+  it('includes question and context in the prompt and parses JSON output', async () => {
     const fakeContext = [
       { content: 'First context chunk' },
       { content: 'Second context chunk' },
@@ -16,12 +16,13 @@ describe('generator_core.mjs (thinking generator)', () => {
       expect(prompt).toContain('First context chunk');
       expect(prompt).toContain('Second context chunk');
 
-      // Return some simple thinking-style output
-      return `<think>
-I consider the meaning of love using only the context.
-</think>
-Love is the recognition of shared being.
-`;
+      // Return JSON output
+      return JSON.stringify({
+        reasoning: ['step A', 'step B'],
+        answer: 'Love is the recognition of shared being.',
+        confidence: 0.92,
+        evidence: ['quote (para #1)'],
+      });
     }),
   };
 
@@ -30,9 +31,10 @@ Love is the recognition of shared being.
     expect(provider.generate).toHaveBeenCalledOnce();
     expect(result.question).toBe('What is love?');
     expect(result.context).toHaveLength(2);
-    expect(result.raw).toContain('<think>');
+    expect(result.raw).toContain('step A');
     expect(result.answer).toBe('Love is the recognition of shared being.');
-    expect(result.thought).toContain('consider the meaning of love');
+    expect(result.thought).toContain('step A');
+    expect(result.confidence).toBeCloseTo(0.92);
   });
 
   it('extracts thought and answer correctly when <think> block is present', async () => {
@@ -78,8 +80,26 @@ The final answer derived from the context.
     );
 
     expect(result.raw).toBe('Just a direct answer with no visible reasoning.');
-    // No think tags means thought=null and answer = full output
+    // No JSON or think tags means thought=null and answer = full output
    expect(result.thought).toBeNull();
     expect(result.answer).toBe('Just a direct answer with no visible reasoning.');
   });
+
+  it('parses Qwen answer block and preserves thinking object', async () => {
+    const fakeContext = [{ content: 'ctx' }];
+    const provider = {
+      generate: vi.fn(async () => ({
+        response: `<|thought|>step1<|end_of_thought|>\n<|answer|>\nConfidence: High\nAnswer: Supported answer\nEvidence: ["quote1 (loc1)", "quote2 (loc2)"]\nLimitations: None\n<|end_of_answer|>`,
+        thinking: { steps: ['t1', 't2'] },
+      })),
+    };
+
+    const result = await runGenerator('Test?', fakeContext, provider);
+
+    expect(result.thought).toEqual({ steps: ['t1', 't2'] });
+    expect(result.answer).toBe('Supported answer');
+    expect(result.confidence).toBe('High');
+    expect(result.evidence).toEqual(['quote1 (loc1)', 'quote2 (loc2)']);
+    expect(result.limitations).toBe('None');
+  });
 });
tests/gold_preview.test.mjs ADDED
@@ -0,0 +1,65 @@
+import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
+import fs from 'fs/promises';
+import path from 'path';
+import os from 'os';
+import { fileURLToPath } from 'url';
+
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+const PROJECT_ROOT = path.join(__dirname, '..');
+
+const sampleGold = [
+  JSON.stringify({
+    question: 'What is the meaning of life?',
+    sample: { answer: '42', raw: '42' },
+    context: [{ id: 'c1', content: 'ctx content' }],
+    verifier: { ok: true, score: 0.9 },
+    reward: { score: 0.8 },
+  }),
+  JSON.stringify({
+    question: 'Q1?',
+    sample: { answer: 'a'.repeat(50) },
+    context: [{ id: 'c2', content: 'ctx content 2' }],
+  }),
+].join('\n');
+
+describe('scripts/gold_preview.mjs', () => {
+  let tmpFile;
+  const origArgv = process.argv.slice();
+
+  beforeEach(async () => {
+    const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), 'gold-prev-'));
+    tmpFile = path.join(tmpDir, 'gold.jsonl');
+    await fs.writeFile(tmpFile, sampleGold, 'utf8');
+  });
+
+  afterEach(async () => {
+    process.argv = origArgv.slice();
+    if (tmpFile) {
+      await fs.rm(path.dirname(tmpFile), { recursive: true, force: true }).catch(() => {});
+    }
+  });
+
+  it('respects --limit', async () => {
+    process.argv = ['node', 'gold_preview.mjs', '--file', tmpFile, '--limit', '1'];
+    const { capturePreview } = await import('../scripts/gold_preview.mjs');
+    const output = await capturePreview();
+    const lines = output.split('\n').filter(Boolean);
+    expect(lines.some((l) => l.startsWith('#2'))).toBe(false);
+  });
+
+  it('respects --max-answer truncation', async () => {
+    process.argv = ['node', 'gold_preview.mjs', '--file', tmpFile, '--max-answer', '10'];
+    const { capturePreview } = await import('../scripts/gold_preview.mjs');
+    const output = await capturePreview();
+    expect(output).toMatch(/A: a{10}… \[\+40 chars\]/);
+  });
+
+  it('shows full when --full is set', async () => {
+    process.argv = ['node', 'gold_preview.mjs', '--file', tmpFile, '--full', '--limit', '1'];
+    const { capturePreview } = await import('../scripts/gold_preview.mjs');
+    const output = await capturePreview();
+    expect(output).toMatch(/A: 42/);
+    expect(output).not.toMatch(/\[\+\d+ chars\]/);
+  });
+});
tests/regenerate_gold_from_cache.test.mjs ADDED
@@ -0,0 +1,112 @@
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import fs from 'fs/promises';
+import path from 'path';
+import os from 'os';
+import { fileURLToPath, pathToFileURL } from 'url';
+
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+const PROJECT_ROOT = path.join(__dirname, '..');
+
+describe('scripts/regenerate_gold_from_cache.mjs', () => {
+  let tmpDir;
+  let cacheDir;
+  let goldPath;
+  let ragPath;
+  const origEnv = { ...process.env };
+
+  beforeEach(async () => {
+    tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), 'regen-cache-'));
+    cacheDir = path.join(tmpDir, 'cache');
+    goldPath = path.join(tmpDir, 'pipeline_gold.jsonl');
+    ragPath = path.join(tmpDir, 'rag_chunks.jsonl');
+
+    process.env.PIPELINE_CACHE_DIR = cacheDir;
+    process.env.GOLD_PATH = goldPath;
+    process.env.RAG_CHUNKS_PATH = ragPath;
+
+    await fs.mkdir(cacheDir, { recursive: true });
+    await fs.mkdir(path.dirname(goldPath), { recursive: true });
+    await fs.mkdir(path.dirname(ragPath), { recursive: true });
+  });
+
+  afterEach(async () => {
+    process.env = { ...origEnv };
+    await fs.rm(tmpDir, { recursive: true, force: true }).catch(() => {});
+  });
+
+  it('reconstructs gold from cache with reward/verifier ok', async () => {
+    // write chunks
+    const chunk = { id: 'c1', content: 'chunk content', source: { meta: 1 } };
+    await fs.writeFile(ragPath, JSON.stringify(chunk) + '\n', 'utf8');
+
+    // questions cache
+    await fs.writeFile(
+      path.join(cacheDir, 'questions.jsonl'),
+      JSON.stringify({
+        chunk_id: 'c1',
+        questions: ['What is X?'],
+        question_ids: ['q1'],
+        ts: Date.now(),
+      }) + '\n',
+      'utf8',
+    );
+
+    // generations cache
+    await fs.writeFile(
+      path.join(cacheDir, 'generations.jsonl'),
+      JSON.stringify({
+        chunk_id: 'c1',
+        question_id: 'q1',
+        gen_id: 'g1',
+        answer: 'Answer text',
+        raw: 'Answer text',
+        ts: Date.now(),
+      }) + '\n',
+      'utf8',
+    );
+
+    // verification cache (ok)
+    await fs.writeFile(
+      path.join(cacheDir, 'verifications.jsonl'),
+      JSON.stringify({
+        chunk_id: 'c1',
+        question_id: 'q1',
+        gen_id: 'g1',
+        ok: true,
+        score: 'PASS',
+        raw: '...',
+        ts: Date.now(),
+      }) + '\n',
+      'utf8',
+    );
+
+    // reward cache (ok)
+    await fs.writeFile(
+      path.join(cacheDir, 'rewards.jsonl'),
+      JSON.stringify({
+        chunk_id: 'c1',
+        question_id: 'q1',
+        gen_id: 'g1',
+        ok: true,
+        score: 0.9,
+        raw: '0.9',
+        ts: Date.now(),
+      }) + '\n',
+      'utf8',
+    );
+
+    const mod = await import(pathToFileURL(path.join(PROJECT_ROOT, 'scripts', 'regenerate_gold_from_cache.mjs')));
+    await mod.main();
+
+    const out = await fs.readFile(goldPath, 'utf8');
+    const lines = out.split('\n').filter(Boolean);
+    expect(lines).toHaveLength(1);
+
+    const rec = JSON.parse(lines[0]);
+    expect(rec.question).toBe('What is X?');
+    expect(rec.sample.answer).toBe('Answer text');
+    expect(rec.verifier.ok).toBe(true);
+    expect(rec.reward.score).toBe(0.9);
+  });
+});