Updated verifier prompt, response parsing, and tests to fit the verifier's JSON output format

Browse files

Files changed (8) hide show

prompts/verifier_prompt.txt +15 -15
src/pipeline/cache.mjs +1 -0
src/pipeline/step.mjs +3 -1
src/verifier/verifier_core.mjs +30 -7
tests/pipeline.full.mock.test.mjs +22 -0
tests/pipeline.mock.test.mjs +4 -5
tests/pipeline_behaviour.test.mjs +10 -4
tests/verifier_core.test.mjs +17 -8

prompts/verifier_prompt.txt CHANGED Viewed

@@ -1,20 +1,20 @@
-You are a grounding verifier. Given a QUESTION, CONTEXT (from source text), and an ANSWER, decide if the answer is fully supported, faithful in tone, and free of hallucinations or unsafe content.
-Checklist:
-- Grounding: every claim is supported by CONTEXT; no invented facts or sources.
-- Relevance: directly addresses QUESTION; no topic drift or new questions.
-- Safety/Tone: respectful, calm, no instructions to perform harmful actions; aligns with confederation/teaching tone.
-- Quality: clear and coherent; no placeholders or TODOs.
-Respond on two lines:
-1) "yes" or "no" (lowercase) — "yes" only if all checklist items pass.
-2) One short justification, citing the key snippet(s) from CONTEXT in quotes.
-QUESTION:
-{{QUESTION}}
-CONTEXT:
-{{CONTEXT}}
-ANSWER:
-{{ANSWER}}

+PROMPT = """
+Given the following QUESTION, DOCUMENT and ANSWER you must analyze the provided answer and determine whether it is faithful to the contents of the DOCUMENT. The ANSWER must not offer new information beyond the context provided in the DOCUMENT. The ANSWER also must not contradict information provided in the DOCUMENT. Output your final verdict by strictly following this format: "PASS" if the answer is faithful to the DOCUMENT and "FAIL" if the answer is not faithful to the DOCUMENT. Show your reasoning.
+--
+QUESTION (THIS DOES NOT COUNT AS BACKGROUND INFORMATION):
+{question}
+--
+DOCUMENT:
+{context}
+--
+ANSWER:
+{answer}
+--
+Your output should be in JSON FORMAT with the keys "REASONING" and "SCORE":
+{{"REASONING": <your reasoning as bullet points>, "SCORE": <your final score>}}
+"""

src/pipeline/cache.mjs CHANGED Viewed

@@ -132,6 +132,7 @@ export async function saveVerification(chunkId, qId, genId, ver, meta = {}) {
     question_id: qId,
     gen_id: genId,
     ok: ver.ok === true,
     raw: ver.raw,
     provider: meta.provider,
     model: meta.model,

     question_id: qId,
     gen_id: genId,
     ok: ver.ok === true,
+    score: ver.score,
     raw: ver.raw,
     provider: meta.provider,
     model: meta.model,

src/pipeline/step.mjs CHANGED Viewed

@@ -198,7 +198,9 @@ export async function runPipelineStep({
     if (verbose) {
       log('   [verifier] ok=' + (ver?.ok === true));
-      log('   ' + preview(ver?.raw ?? '', 200).replace(/\n/g, '\n   '));
     }
   } catch (e) {
     const msg = e?.message || String(e);

     if (verbose) {
       log('   [verifier] ok=' + (ver?.ok === true));
+      const raw = ver?.raw ?? '';
+      log('   [verifier] raw transcript:');
+      log('   ' + raw.replace(/\n/g, '\n   '));
     }
   } catch (e) {
     const msg = e?.message || String(e);

src/verifier/verifier_core.mjs CHANGED Viewed

@@ -21,16 +21,39 @@ export async function runVerifier({ question, context, gen }, provider) {
     .join("\n\n---\n\n");
   const prompt = tmpl
-    .replace(/{{QUESTION}}/g, question)
-    .replace(/{{ANSWER}}/g, gen.answer || '')
-    .replace(/{{CONTEXT}}/g, ctxText);
   const raw = await provider.generate(prompt);
-  const first = raw.split("\n")[0].trim();
-  const ok = /^yes\b/i.test(first);
-  return { raw, ok };
 }
 export default { runVerifier };

     .join("\n\n---\n\n");
   const prompt = tmpl
+    .replace(/{question}/g, question)
+    .replace(/{answer}/g, gen.answer || '')
+    .replace(/{context}/g, ctxText);
   const raw = await provider.generate(prompt);
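+  // Parse the verifier's reply as strict JSON:
+  // {"REASONING": <bullet points>, "SCORE": <final score>}
+  // SCORE is accepted only as a number or a numeric string; ok is true when it is >= 0.5.
+  // NOTE(review): prompts/verifier_prompt.txt asks the model for "PASS"/"FAIL" verdicts.
+  // Those are non-numeric strings, so they leave score = null and ok = false here —
+  // confirm which SCORE format the verifier model actually emits.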
+  let ok = false;
+  let score = null;
+  let reasoning = null;
+  let error = null;
+  try {
+    const parsed = JSON.parse(raw);
+    reasoning = parsed?.REASONING ?? null;
+    if (parsed && Object.prototype.hasOwnProperty.call(parsed, 'SCORE')) {
+      const s = parsed.SCORE;
+      if (typeof s === 'number') {
+        score = s;
+      } else if (typeof s === 'string') {
+        const num = Number(s);
+        if (Number.isFinite(num)) score = num;
+      }
+      if (score != null) {
+        ok = score >= 0.5;
+      }
+    }
+  } catch {
+    error = 'invalid_json';
+  }
+  return { raw, ok, score, reasoning, error };
 }
 export default { runVerifier };

tests/pipeline.full.mock.test.mjs CHANGED Viewed

@@ -45,6 +45,28 @@ describe('full pipeline (mock providers)', () => {
       fetchChunksFromIndex: vi.fn(),
     }));
     const { runPipelineBatch } = await import('../src/pipeline/batch.mjs');
     const outPath = path.join(os.tmpdir(), `mock-pipeline-${Date.now()}.jsonl`);

       fetchChunksFromIndex: vi.fn(),
     }));
+    // Mock verifier to return JSON with SCORE
+    vi.doMock('../src/providers/provider.mjs', () => ({
+      loadProviderFor: (stage) => {
+        if (stage === 'verifier') {
+          return {
+            generate: async () =>
+              JSON.stringify({
+                REASONING: ['supported'],
+                SCORE: 0.9,
+              }),
+          };
+        }
+        if (stage === 'reward') return { generate: async () => '0.9 good' };
+        if (stage === 'question')
+          return {
+            generate: async () =>
+              JSON.stringify({ questions: ['Q1?', 'Q2?', 'Q3?'] }),
+          };
+        return { generate: async () => '<think>t</think>answer' };
+      },
+    }));
     const { runPipelineBatch } = await import('../src/pipeline/batch.mjs');
     const outPath = path.join(os.tmpdir(), `mock-pipeline-${Date.now()}.jsonl`);

tests/pipeline.mock.test.mjs CHANGED Viewed

@@ -17,8 +17,7 @@ vi.mock('../src/providers/provider.mjs', () => {
             return 'mocked';
           }
           if (stage === 'verifier') {
-            // verifier returns a "yes" first line so runVerifier.ok = true
-            return 'yes\nmock verifier justification';
           }
           if (stage === 'reward') {
             // reward returns a score in [0,1]
@@ -58,9 +57,9 @@ describe('runPipelineStep (mocked providers)', () => {
     // generator output made it through
     expect(result.gen.answer).toBe('mocked');
-    // verifier + reward both say OK
-    expect(result.ver.ok).toBe(true);
-    expect(result.rew.ok).toBe(true);
     // NEW CONTRACT:
     // even though retrieval returns 2 chunks, step.mjs enforces a single-chunk context

             return 'mocked';
           }
           if (stage === 'verifier') {
+            return JSON.stringify({ REASONING: ['ok'], SCORE: 0.9 });
           }
           if (stage === 'reward') {
             // reward returns a score in [0,1]
     // generator output made it through
     expect(result.gen.answer).toBe('mocked');
+  // verifier + reward both say OK
+  expect(result.ver.ok).toBe(true);
+  expect(result.rew.ok).toBe(true);
     // NEW CONTRACT:
     // even though retrieval returns 2 chunks, step.mjs enforces a single-chunk context

tests/pipeline_behaviour.test.mjs CHANGED Viewed

@@ -94,10 +94,16 @@ describe('runPipelineBatch question cap', () => {
     // Mock question provider + retrieval
     vi.doMock('../src/providers/provider.mjs', () => ({
-      loadProviderFor: (stage) =>
-        stage === 'question'
-          ? { generate: async () => questionsPerChunk }
-          : { generate: async () => '' },
     }));
     // Force question generator to return a fixed list to avoid parser variability

     // Mock question provider + retrieval
     vi.doMock('../src/providers/provider.mjs', () => ({
+      loadProviderFor: (stage) => {
+        if (stage === 'question') return { generate: async () => questionsPerChunk };
+        if (stage === 'verifier')
+          return {
+            generate: async () =>
+              JSON.stringify({ REASONING: ['ok'], SCORE: 1 }),
+          };
+        if (stage === 'reward') return { generate: async () => '0.9 good' };
+        return { generate: async () => '' };
+      },
     }));
     // Force question generator to return a fixed list to avoid parser variability

tests/verifier_core.test.mjs CHANGED Viewed

@@ -3,7 +3,7 @@ import { describe, it, expect, vi } from 'vitest';
 import { runVerifier } from '../src/verifier/verifier_core.mjs';
 describe('verifier_core.mjs', () => {
-  it('returns ok=true when first line is YES', async () => {
     const sample = {
       question: 'What is love?',
       context: [{ content: 'ctx' }],
@@ -11,7 +11,9 @@ describe('verifier_core.mjs', () => {
     };
     const provider = {
-      generate: vi.fn(async () => 'YES\nLooks good.'),
     };
     const result = await runVerifier(
@@ -25,10 +27,11 @@ describe('verifier_core.mjs', () => {
     expect(provider.generate).toHaveBeenCalledOnce();
     expect(result.ok).toBe(true);
-    expect(result.raw.startsWith('YES')).toBe(true);
   });
-  it('returns ok=false when first line is NO', async () => {
     const sample = {
       question: 'What is love?',
       context: [{ content: 'ctx' }],
@@ -36,7 +39,9 @@ describe('verifier_core.mjs', () => {
     };
     const provider = {
-      generate: vi.fn(async () => 'NO\nIncorrect interpretation.'),
     };
     const result = await runVerifier(
@@ -49,10 +54,10 @@ describe('verifier_core.mjs', () => {
     );
     expect(result.ok).toBe(false);
-    expect(result.raw.startsWith('NO')).toBe(true);
   });
-  it('accepts lowercase/whitespace yes on first line', async () => {
     const sample = {
       question: 'What is love?',
       context: [{ content: 'ctx' }],
@@ -60,7 +65,9 @@ describe('verifier_core.mjs', () => {
     };
     const provider = {
-      generate: vi.fn(async () => 'yes, this is fine\nMore details…'),
     };
     const result = await runVerifier(
@@ -73,5 +80,7 @@ describe('verifier_core.mjs', () => {
     );
     expect(result.ok).toBe(true);
   });
 });

 import { runVerifier } from '../src/verifier/verifier_core.mjs';
 describe('verifier_core.mjs', () => {
+  it('returns ok=true when SCORE >= 0.5 JSON', async () => {
     const sample = {
       question: 'What is love?',
       context: [{ content: 'ctx' }],
     };
     const provider = {
+      generate: vi.fn(async () =>
+        JSON.stringify({ REASONING: ['ok'], SCORE: 0.9 }),
+      ),
     };
     const result = await runVerifier(
     expect(provider.generate).toHaveBeenCalledOnce();
     expect(result.ok).toBe(true);
+    expect(result.score).toBe(0.9);
+    expect(result.reasoning).toEqual(['ok']);
   });
+  it('returns ok=false when SCORE < 0.5 JSON', async () => {
     const sample = {
       question: 'What is love?',
       context: [{ content: 'ctx' }],
     };
     const provider = {
+      generate: vi.fn(async () =>
+        JSON.stringify({ REASONING: ['bad'], SCORE: 0.1 }),
+      ),
     };
     const result = await runVerifier(
     );
     expect(result.ok).toBe(false);
+    expect(result.score).toBe(0.1);
   });
+  it('handles string SCORE and preserves reasoning', async () => {
     const sample = {
       question: 'What is love?',
       context: [{ content: 'ctx' }],
     };
     const provider = {
+      generate: vi.fn(async () =>
+        JSON.stringify({ REASONING: ['fine'], SCORE: '0.7' }),
+      ),
     };
     const result = await runVerifier(
     );
     expect(result.ok).toBe(true);
+    expect(result.score).toBe(0.7);
+    expect(result.reasoning).toEqual(['fine']);
   });
 });