updated to fit verifier
Browse files
- prompts/verifier_prompt.txt +15 -15
- src/pipeline/cache.mjs +1 -0
- src/pipeline/step.mjs +3 -1
- src/verifier/verifier_core.mjs +30 -7
- tests/pipeline.full.mock.test.mjs +22 -0
- tests/pipeline.mock.test.mjs +4 -5
- tests/pipeline_behaviour.test.mjs +10 -4
- tests/verifier_core.test.mjs +17 -8
prompts/verifier_prompt.txt
CHANGED
|
@@ -1,20 +1,20 @@
|
|
| 1 |
-
|
|
|
|
| 2 |
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
- Safety/Tone: respectful, calm, no instructions to perform harmful actions; aligns with confederation/teaching tone.
|
| 7 |
-
- Quality: clear and coherent; no placeholders or TODOs.
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
|
| 13 |
-
|
| 14 |
-
|
|
|
|
| 15 |
|
| 16 |
-
|
| 17 |
-
{{CONTEXT}}
|
| 18 |
|
| 19 |
-
|
| 20 |
-
{{
|
|
|
|
|
|
| 1 |
+
PROMPT = """
|
| 2 |
+
Given the following QUESTION, DOCUMENT and ANSWER you must analyze the provided answer and determine whether it is faithful to the contents of the DOCUMENT. The ANSWER must not offer new information beyond the context provided in the DOCUMENT. The ANSWER also must not contradict information provided in the DOCUMENT. Output your final verdict by strictly following this format: "PASS" if the answer is faithful to the DOCUMENT and "FAIL" if the answer is not faithful to the DOCUMENT. Show your reasoning.
|
| 3 |
|
| 4 |
+
--
|
| 5 |
+
QUESTION (THIS DOES NOT COUNT AS BACKGROUND INFORMATION):
|
| 6 |
+
{question}
|
|
|
|
|
|
|
| 7 |
|
| 8 |
+
--
|
| 9 |
+
DOCUMENT:
|
| 10 |
+
{context}
|
| 11 |
|
| 12 |
+
--
|
| 13 |
+
ANSWER:
|
| 14 |
+
{answer}
|
| 15 |
|
| 16 |
+
--
|
|
|
|
| 17 |
|
| 18 |
+
Your output should be in JSON FORMAT with the keys "REASONING" and "SCORE":
|
| 19 |
+
{{"REASONING": <your reasoning as bullet points>, "SCORE": <your final score>}}
|
| 20 |
+
"""
|
src/pipeline/cache.mjs
CHANGED
|
@@ -132,6 +132,7 @@ export async function saveVerification(chunkId, qId, genId, ver, meta = {}) {
|
|
| 132 |
question_id: qId,
|
| 133 |
gen_id: genId,
|
| 134 |
ok: ver.ok === true,
|
|
|
|
| 135 |
raw: ver.raw,
|
| 136 |
provider: meta.provider,
|
| 137 |
model: meta.model,
|
|
|
|
| 132 |
question_id: qId,
|
| 133 |
gen_id: genId,
|
| 134 |
ok: ver.ok === true,
|
| 135 |
+
score: ver.score,
|
| 136 |
raw: ver.raw,
|
| 137 |
provider: meta.provider,
|
| 138 |
model: meta.model,
|
src/pipeline/step.mjs
CHANGED
|
@@ -198,7 +198,9 @@ export async function runPipelineStep({
|
|
| 198 |
|
| 199 |
if (verbose) {
|
| 200 |
log(' [verifier] ok=' + (ver?.ok === true));
|
| 201 |
-
|
|
|
|
|
|
|
| 202 |
}
|
| 203 |
} catch (e) {
|
| 204 |
const msg = e?.message || String(e);
|
|
|
|
| 198 |
|
| 199 |
if (verbose) {
|
| 200 |
log(' [verifier] ok=' + (ver?.ok === true));
|
| 201 |
+
const raw = ver?.raw ?? '';
|
| 202 |
+
log(' [verifier] raw transcript:');
|
| 203 |
+
log(' ' + raw.replace(/\n/g, '\n '));
|
| 204 |
}
|
| 205 |
} catch (e) {
|
| 206 |
const msg = e?.message || String(e);
|
src/verifier/verifier_core.mjs
CHANGED
|
@@ -21,16 +21,39 @@ export async function runVerifier({ question, context, gen }, provider) {
|
|
| 21 |
.join("\n\n---\n\n");
|
| 22 |
|
| 23 |
const prompt = tmpl
|
| 24 |
-
.replace(/{
|
| 25 |
-
.replace(/{
|
| 26 |
-
.replace(/{
|
| 27 |
|
| 28 |
const raw = await provider.generate(prompt);
|
| 29 |
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
}
|
| 35 |
|
| 36 |
export default { runVerifier };
|
|
|
|
| 21 |
.join("\n\n---\n\n");
|
| 22 |
|
| 23 |
const prompt = tmpl
|
| 24 |
+
.replace(/{question}/g, question)
|
| 25 |
+
.replace(/{answer}/g, gen.answer || '')
|
| 26 |
+
.replace(/{context}/g, ctxText);
|
| 27 |
|
| 28 |
const raw = await provider.generate(prompt);
|
| 29 |
|
| 30 |
+
// Parse strict JSON format:
|
| 31 |
+
// {"REASONING": <bullet points>, "SCORE": <final score>}
|
| 32 |
+
let ok = false;
|
| 33 |
+
let score = null;
|
| 34 |
+
let reasoning = null;
|
| 35 |
+
let error = null;
|
| 36 |
+
try {
|
| 37 |
+
const parsed = JSON.parse(raw);
|
| 38 |
+
reasoning = parsed?.REASONING ?? null;
|
| 39 |
+
|
| 40 |
+
if (parsed && Object.prototype.hasOwnProperty.call(parsed, 'SCORE')) {
|
| 41 |
+
const s = parsed.SCORE;
|
| 42 |
+
if (typeof s === 'number') {
|
| 43 |
+
score = s;
|
| 44 |
+
} else if (typeof s === 'string') {
|
| 45 |
+
const num = Number(s);
|
| 46 |
+
if (Number.isFinite(num)) score = num;
|
| 47 |
+
}
|
| 48 |
+
if (score != null) {
|
| 49 |
+
ok = score >= 0.5;
|
| 50 |
+
}
|
| 51 |
+
}
|
| 52 |
+
} catch {
|
| 53 |
+
error = 'invalid_json';
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
return { raw, ok, score, reasoning, error };
|
| 57 |
}
|
| 58 |
|
| 59 |
export default { runVerifier };
|
tests/pipeline.full.mock.test.mjs
CHANGED
|
@@ -45,6 +45,28 @@ describe('full pipeline (mock providers)', () => {
|
|
| 45 |
fetchChunksFromIndex: vi.fn(),
|
| 46 |
}));
|
| 47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
const { runPipelineBatch } = await import('../src/pipeline/batch.mjs');
|
| 49 |
|
| 50 |
const outPath = path.join(os.tmpdir(), `mock-pipeline-${Date.now()}.jsonl`);
|
|
|
|
| 45 |
fetchChunksFromIndex: vi.fn(),
|
| 46 |
}));
|
| 47 |
|
| 48 |
+
// Mock every provider stage; the verifier stage returns JSON with a numeric SCORE
|
| 49 |
+
vi.doMock('../src/providers/provider.mjs', () => ({
|
| 50 |
+
loadProviderFor: (stage) => {
|
| 51 |
+
if (stage === 'verifier') {
|
| 52 |
+
return {
|
| 53 |
+
generate: async () =>
|
| 54 |
+
JSON.stringify({
|
| 55 |
+
REASONING: ['supported'],
|
| 56 |
+
SCORE: 0.9,
|
| 57 |
+
}),
|
| 58 |
+
};
|
| 59 |
+
}
|
| 60 |
+
if (stage === 'reward') return { generate: async () => '0.9 good' };
|
| 61 |
+
if (stage === 'question')
|
| 62 |
+
return {
|
| 63 |
+
generate: async () =>
|
| 64 |
+
JSON.stringify({ questions: ['Q1?', 'Q2?', 'Q3?'] }),
|
| 65 |
+
};
|
| 66 |
+
return { generate: async () => '<think>t</think>answer' };
|
| 67 |
+
},
|
| 68 |
+
}));
|
| 69 |
+
|
| 70 |
const { runPipelineBatch } = await import('../src/pipeline/batch.mjs');
|
| 71 |
|
| 72 |
const outPath = path.join(os.tmpdir(), `mock-pipeline-${Date.now()}.jsonl`);
|
tests/pipeline.mock.test.mjs
CHANGED
|
@@ -17,8 +17,7 @@ vi.mock('../src/providers/provider.mjs', () => {
|
|
| 17 |
return 'mocked';
|
| 18 |
}
|
| 19 |
if (stage === 'verifier') {
|
| 20 |
-
|
| 21 |
-
return 'yes\nmock verifier justification';
|
| 22 |
}
|
| 23 |
if (stage === 'reward') {
|
| 24 |
// reward returns a score in [0,1]
|
|
@@ -58,9 +57,9 @@ describe('runPipelineStep (mocked providers)', () => {
|
|
| 58 |
// generator output made it through
|
| 59 |
expect(result.gen.answer).toBe('mocked');
|
| 60 |
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
|
| 65 |
// NEW CONTRACT:
|
| 66 |
// even though retrieval returns 2 chunks, step.mjs enforces a single-chunk context
|
|
|
|
| 17 |
return 'mocked';
|
| 18 |
}
|
| 19 |
if (stage === 'verifier') {
|
| 20 |
+
return JSON.stringify({ REASONING: ['ok'], SCORE: 0.9 });
|
|
|
|
| 21 |
}
|
| 22 |
if (stage === 'reward') {
|
| 23 |
// reward returns a score in [0,1]
|
|
|
|
| 57 |
// generator output made it through
|
| 58 |
expect(result.gen.answer).toBe('mocked');
|
| 59 |
|
| 60 |
+
// verifier + reward both say OK
|
| 61 |
+
expect(result.ver.ok).toBe(true);
|
| 62 |
+
expect(result.rew.ok).toBe(true);
|
| 63 |
|
| 64 |
// NEW CONTRACT:
|
| 65 |
// even though retrieval returns 2 chunks, step.mjs enforces a single-chunk context
|
tests/pipeline_behaviour.test.mjs
CHANGED
|
@@ -94,10 +94,16 @@ describe('runPipelineBatch question cap', () => {
|
|
| 94 |
|
| 95 |
// Mock question provider + retrieval
|
| 96 |
vi.doMock('../src/providers/provider.mjs', () => ({
|
| 97 |
-
loadProviderFor: (stage) =>
|
| 98 |
-
stage === 'question'
|
| 99 |
-
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
}));
|
| 102 |
|
| 103 |
// Force question generator to return a fixed list to avoid parser variability
|
|
|
|
| 94 |
|
| 95 |
// Mock question, verifier, and reward providers + retrieval
|
| 96 |
vi.doMock('../src/providers/provider.mjs', () => ({
|
| 97 |
+
loadProviderFor: (stage) => {
|
| 98 |
+
if (stage === 'question') return { generate: async () => questionsPerChunk };
|
| 99 |
+
if (stage === 'verifier')
|
| 100 |
+
return {
|
| 101 |
+
generate: async () =>
|
| 102 |
+
JSON.stringify({ REASONING: ['ok'], SCORE: 1 }),
|
| 103 |
+
};
|
| 104 |
+
if (stage === 'reward') return { generate: async () => '0.9 good' };
|
| 105 |
+
return { generate: async () => '' };
|
| 106 |
+
},
|
| 107 |
}));
|
| 108 |
|
| 109 |
// Force question generator to return a fixed list to avoid parser variability
|
tests/verifier_core.test.mjs
CHANGED
|
@@ -3,7 +3,7 @@ import { describe, it, expect, vi } from 'vitest';
|
|
| 3 |
import { runVerifier } from '../src/verifier/verifier_core.mjs';
|
| 4 |
|
| 5 |
describe('verifier_core.mjs', () => {
|
| 6 |
-
it('returns ok=true when
|
| 7 |
const sample = {
|
| 8 |
question: 'What is love?',
|
| 9 |
context: [{ content: 'ctx' }],
|
|
@@ -11,7 +11,9 @@ describe('verifier_core.mjs', () => {
|
|
| 11 |
};
|
| 12 |
|
| 13 |
const provider = {
|
| 14 |
-
generate: vi.fn(async () =>
|
|
|
|
|
|
|
| 15 |
};
|
| 16 |
|
| 17 |
const result = await runVerifier(
|
|
@@ -25,10 +27,11 @@ describe('verifier_core.mjs', () => {
|
|
| 25 |
|
| 26 |
expect(provider.generate).toHaveBeenCalledOnce();
|
| 27 |
expect(result.ok).toBe(true);
|
| 28 |
-
expect(result.
|
|
|
|
| 29 |
});
|
| 30 |
|
| 31 |
-
it('returns ok=false when
|
| 32 |
const sample = {
|
| 33 |
question: 'What is love?',
|
| 34 |
context: [{ content: 'ctx' }],
|
|
@@ -36,7 +39,9 @@ describe('verifier_core.mjs', () => {
|
|
| 36 |
};
|
| 37 |
|
| 38 |
const provider = {
|
| 39 |
-
generate: vi.fn(async () =>
|
|
|
|
|
|
|
| 40 |
};
|
| 41 |
|
| 42 |
const result = await runVerifier(
|
|
@@ -49,10 +54,10 @@ describe('verifier_core.mjs', () => {
|
|
| 49 |
);
|
| 50 |
|
| 51 |
expect(result.ok).toBe(false);
|
| 52 |
-
expect(result.
|
| 53 |
});
|
| 54 |
|
| 55 |
-
it('
|
| 56 |
const sample = {
|
| 57 |
question: 'What is love?',
|
| 58 |
context: [{ content: 'ctx' }],
|
|
@@ -60,7 +65,9 @@ describe('verifier_core.mjs', () => {
|
|
| 60 |
};
|
| 61 |
|
| 62 |
const provider = {
|
| 63 |
-
generate: vi.fn(async () =>
|
|
|
|
|
|
|
| 64 |
};
|
| 65 |
|
| 66 |
const result = await runVerifier(
|
|
@@ -73,5 +80,7 @@ describe('verifier_core.mjs', () => {
|
|
| 73 |
);
|
| 74 |
|
| 75 |
expect(result.ok).toBe(true);
|
|
|
|
|
|
|
| 76 |
});
|
| 77 |
});
|
|
|
|
| 3 |
import { runVerifier } from '../src/verifier/verifier_core.mjs';
|
| 4 |
|
| 5 |
describe('verifier_core.mjs', () => {
|
| 6 |
+
it('returns ok=true when SCORE >= 0.5 JSON', async () => {
|
| 7 |
const sample = {
|
| 8 |
question: 'What is love?',
|
| 9 |
context: [{ content: 'ctx' }],
|
|
|
|
| 11 |
};
|
| 12 |
|
| 13 |
const provider = {
|
| 14 |
+
generate: vi.fn(async () =>
|
| 15 |
+
JSON.stringify({ REASONING: ['ok'], SCORE: 0.9 }),
|
| 16 |
+
),
|
| 17 |
};
|
| 18 |
|
| 19 |
const result = await runVerifier(
|
|
|
|
| 27 |
|
| 28 |
expect(provider.generate).toHaveBeenCalledOnce();
|
| 29 |
expect(result.ok).toBe(true);
|
| 30 |
+
expect(result.score).toBe(0.9);
|
| 31 |
+
expect(result.reasoning).toEqual(['ok']);
|
| 32 |
});
|
| 33 |
|
| 34 |
+
it('returns ok=false when SCORE < 0.5 JSON', async () => {
|
| 35 |
const sample = {
|
| 36 |
question: 'What is love?',
|
| 37 |
context: [{ content: 'ctx' }],
|
|
|
|
| 39 |
};
|
| 40 |
|
| 41 |
const provider = {
|
| 42 |
+
generate: vi.fn(async () =>
|
| 43 |
+
JSON.stringify({ REASONING: ['bad'], SCORE: 0.1 }),
|
| 44 |
+
),
|
| 45 |
};
|
| 46 |
|
| 47 |
const result = await runVerifier(
|
|
|
|
| 54 |
);
|
| 55 |
|
| 56 |
expect(result.ok).toBe(false);
|
| 57 |
+
expect(result.score).toBe(0.1);
|
| 58 |
});
|
| 59 |
|
| 60 |
+
it('handles string SCORE and preserves reasoning', async () => {
|
| 61 |
const sample = {
|
| 62 |
question: 'What is love?',
|
| 63 |
context: [{ content: 'ctx' }],
|
|
|
|
| 65 |
};
|
| 66 |
|
| 67 |
const provider = {
|
| 68 |
+
generate: vi.fn(async () =>
|
| 69 |
+
JSON.stringify({ REASONING: ['fine'], SCORE: '0.7' }),
|
| 70 |
+
),
|
| 71 |
};
|
| 72 |
|
| 73 |
const result = await runVerifier(
|
|
|
|
| 80 |
);
|
| 81 |
|
| 82 |
expect(result.ok).toBe(true);
|
| 83 |
+
expect(result.score).toBe(0.7);
|
| 84 |
+
expect(result.reasoning).toEqual(['fine']);
|
| 85 |
});
|
| 86 |
});
|