htaf committed on
Commit
5e7271c
·
1 Parent(s): a437c71

updated to fit verifier

Browse files
prompts/verifier_prompt.txt CHANGED
@@ -1,20 +1,20 @@
1
- You are a grounding verifier. Given a QUESTION, CONTEXT (from source text), and an ANSWER, decide if the answer is fully supported, faithful in tone, and free of hallucinations or unsafe content.
 
2
 
3
- Checklist:
4
- - Grounding: every claim is supported by CONTEXT; no invented facts or sources.
5
- - Relevance: directly addresses QUESTION; no topic drift or new questions.
6
- - Safety/Tone: respectful, calm, no instructions to perform harmful actions; aligns with confederation/teaching tone.
7
- - Quality: clear and coherent; no placeholders or TODOs.
8
 
9
- Respond on two lines:
10
- 1) "yes" or "no" (lowercase) — "yes" only if all checklist items pass.
11
- 2) One short justification, citing the key snippet(s) from CONTEXT in quotes.
12
 
13
- QUESTION:
14
- {{QUESTION}}
 
15
 
16
- CONTEXT:
17
- {{CONTEXT}}
18
 
19
- ANSWER:
20
- {{ANSWER}}
 
 
1
+ PROMPT = """
2
+ Given the following QUESTION, DOCUMENT and ANSWER you must analyze the provided answer and determine whether it is faithful to the contents of the DOCUMENT. The ANSWER must not offer new information beyond the context provided in the DOCUMENT. The ANSWER also must not contradict information provided in the DOCUMENT. Output your final verdict by strictly following this format: "PASS" if the answer is faithful to the DOCUMENT and "FAIL" if the answer is not faithful to the DOCUMENT. Show your reasoning.
3
 
4
+ --
5
+ QUESTION (THIS DOES NOT COUNT AS BACKGROUND INFORMATION):
6
+ {question}
 
 
7
 
8
+ --
9
+ DOCUMENT:
10
+ {context}
11
 
12
+ --
13
+ ANSWER:
14
+ {answer}
15
 
16
+ --
 
17
 
18
+ Your output should be in JSON FORMAT with the keys "REASONING" and "SCORE":
19
+ {{"REASONING": <your reasoning as bullet points>, "SCORE": <your final score>}}
20
+ """
src/pipeline/cache.mjs CHANGED
@@ -132,6 +132,7 @@ export async function saveVerification(chunkId, qId, genId, ver, meta = {}) {
132
  question_id: qId,
133
  gen_id: genId,
134
  ok: ver.ok === true,
 
135
  raw: ver.raw,
136
  provider: meta.provider,
137
  model: meta.model,
 
132
  question_id: qId,
133
  gen_id: genId,
134
  ok: ver.ok === true,
135
+ score: ver.score,
136
  raw: ver.raw,
137
  provider: meta.provider,
138
  model: meta.model,
src/pipeline/step.mjs CHANGED
@@ -198,7 +198,9 @@ export async function runPipelineStep({
198
 
199
  if (verbose) {
200
  log(' [verifier] ok=' + (ver?.ok === true));
201
- log(' ' + preview(ver?.raw ?? '', 200).replace(/\n/g, '\n '));
 
 
202
  }
203
  } catch (e) {
204
  const msg = e?.message || String(e);
 
198
 
199
  if (verbose) {
200
  log(' [verifier] ok=' + (ver?.ok === true));
201
+ const raw = ver?.raw ?? '';
202
+ log(' [verifier] raw transcript:');
203
+ log(' ' + raw.replace(/\n/g, '\n '));
204
  }
205
  } catch (e) {
206
  const msg = e?.message || String(e);
src/verifier/verifier_core.mjs CHANGED
@@ -21,16 +21,39 @@ export async function runVerifier({ question, context, gen }, provider) {
21
  .join("\n\n---\n\n");
22
 
23
  const prompt = tmpl
24
- .replace(/{{QUESTION}}/g, question)
25
- .replace(/{{ANSWER}}/g, gen.answer || '')
26
- .replace(/{{CONTEXT}}/g, ctxText);
27
 
28
  const raw = await provider.generate(prompt);
29
 
30
- const first = raw.split("\n")[0].trim();
31
- const ok = /^yes\b/i.test(first);
32
-
33
- return { raw, ok };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  }
35
 
36
  export default { runVerifier };
 
21
  .join("\n\n---\n\n");
22
 
23
  const prompt = tmpl
24
+ .replace(/{question}/g, question)
25
+ .replace(/{answer}/g, gen.answer || '')
26
+ .replace(/{context}/g, ctxText);
27
 
28
  const raw = await provider.generate(prompt);
29
 
30
+ // Parse strict JSON format:
31
+ // {"REASONING": <bullet points>, "SCORE": <final score>}
32
+ let ok = false;
33
+ let score = null;
34
+ let reasoning = null;
35
+ let error = null;
36
+ try {
37
+ const parsed = JSON.parse(raw);
38
+ reasoning = parsed?.REASONING ?? null;
39
+
40
+ if (parsed && Object.prototype.hasOwnProperty.call(parsed, 'SCORE')) {
41
+ const s = parsed.SCORE;
42
+ if (typeof s === 'number') {
43
+ score = s;
44
+ } else if (typeof s === 'string') {
45
+ const num = Number(s);
46
+ if (Number.isFinite(num)) score = num;
47
+ }
48
+ if (score != null) {
49
+ ok = score >= 0.5;
50
+ }
51
+ }
52
+ } catch {
53
+ error = 'invalid_json';
54
+ }
55
+
56
+ return { raw, ok, score, reasoning, error };
57
  }
58
 
59
  export default { runVerifier };
tests/pipeline.full.mock.test.mjs CHANGED
@@ -45,6 +45,28 @@ describe('full pipeline (mock providers)', () => {
45
  fetchChunksFromIndex: vi.fn(),
46
  }));
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  const { runPipelineBatch } = await import('../src/pipeline/batch.mjs');
49
 
50
  const outPath = path.join(os.tmpdir(), `mock-pipeline-${Date.now()}.jsonl`);
 
45
  fetchChunksFromIndex: vi.fn(),
46
  }));
47
 
48
+ // Mock verifier to return JSON with SCORE
49
+ vi.doMock('../src/providers/provider.mjs', () => ({
50
+ loadProviderFor: (stage) => {
51
+ if (stage === 'verifier') {
52
+ return {
53
+ generate: async () =>
54
+ JSON.stringify({
55
+ REASONING: ['supported'],
56
+ SCORE: 0.9,
57
+ }),
58
+ };
59
+ }
60
+ if (stage === 'reward') return { generate: async () => '0.9 good' };
61
+ if (stage === 'question')
62
+ return {
63
+ generate: async () =>
64
+ JSON.stringify({ questions: ['Q1?', 'Q2?', 'Q3?'] }),
65
+ };
66
+ return { generate: async () => '<think>t</think>answer' };
67
+ },
68
+ }));
69
+
70
  const { runPipelineBatch } = await import('../src/pipeline/batch.mjs');
71
 
72
  const outPath = path.join(os.tmpdir(), `mock-pipeline-${Date.now()}.jsonl`);
tests/pipeline.mock.test.mjs CHANGED
@@ -17,8 +17,7 @@ vi.mock('../src/providers/provider.mjs', () => {
17
  return 'mocked';
18
  }
19
  if (stage === 'verifier') {
20
- // verifier returns a "yes" first line so runVerifier.ok = true
21
- return 'yes\nmock verifier justification';
22
  }
23
  if (stage === 'reward') {
24
  // reward returns a score in [0,1]
@@ -58,9 +57,9 @@ describe('runPipelineStep (mocked providers)', () => {
58
  // generator output made it through
59
  expect(result.gen.answer).toBe('mocked');
60
 
61
- // verifier + reward both say OK
62
- expect(result.ver.ok).toBe(true);
63
- expect(result.rew.ok).toBe(true);
64
 
65
  // NEW CONTRACT:
66
  // even though retrieval returns 2 chunks, step.mjs enforces a single-chunk context
 
17
  return 'mocked';
18
  }
19
  if (stage === 'verifier') {
20
+ return JSON.stringify({ REASONING: ['ok'], SCORE: 0.9 });
 
21
  }
22
  if (stage === 'reward') {
23
  // reward returns a score in [0,1]
 
57
  // generator output made it through
58
  expect(result.gen.answer).toBe('mocked');
59
 
60
+ // verifier + reward both say OK
61
+ expect(result.ver.ok).toBe(true);
62
+ expect(result.rew.ok).toBe(true);
63
 
64
  // NEW CONTRACT:
65
  // even though retrieval returns 2 chunks, step.mjs enforces a single-chunk context
tests/pipeline_behaviour.test.mjs CHANGED
@@ -94,10 +94,16 @@ describe('runPipelineBatch question cap', () => {
94
 
95
  // Mock question provider + retrieval
96
  vi.doMock('../src/providers/provider.mjs', () => ({
97
- loadProviderFor: (stage) =>
98
- stage === 'question'
99
- ? { generate: async () => questionsPerChunk }
100
- : { generate: async () => '' },
 
 
 
 
 
 
101
  }));
102
 
103
  // Force question generator to return a fixed list to avoid parser variability
 
94
 
95
  // Mock question provider + retrieval
96
  vi.doMock('../src/providers/provider.mjs', () => ({
97
+ loadProviderFor: (stage) => {
98
+ if (stage === 'question') return { generate: async () => questionsPerChunk };
99
+ if (stage === 'verifier')
100
+ return {
101
+ generate: async () =>
102
+ JSON.stringify({ REASONING: ['ok'], SCORE: 1 }),
103
+ };
104
+ if (stage === 'reward') return { generate: async () => '0.9 good' };
105
+ return { generate: async () => '' };
106
+ },
107
  }));
108
 
109
  // Force question generator to return a fixed list to avoid parser variability
tests/verifier_core.test.mjs CHANGED
@@ -3,7 +3,7 @@ import { describe, it, expect, vi } from 'vitest';
3
  import { runVerifier } from '../src/verifier/verifier_core.mjs';
4
 
5
  describe('verifier_core.mjs', () => {
6
- it('returns ok=true when first line is YES', async () => {
7
  const sample = {
8
  question: 'What is love?',
9
  context: [{ content: 'ctx' }],
@@ -11,7 +11,9 @@ describe('verifier_core.mjs', () => {
11
  };
12
 
13
  const provider = {
14
- generate: vi.fn(async () => 'YES\nLooks good.'),
 
 
15
  };
16
 
17
  const result = await runVerifier(
@@ -25,10 +27,11 @@ describe('verifier_core.mjs', () => {
25
 
26
  expect(provider.generate).toHaveBeenCalledOnce();
27
  expect(result.ok).toBe(true);
28
- expect(result.raw.startsWith('YES')).toBe(true);
 
29
  });
30
 
31
- it('returns ok=false when first line is NO', async () => {
32
  const sample = {
33
  question: 'What is love?',
34
  context: [{ content: 'ctx' }],
@@ -36,7 +39,9 @@ describe('verifier_core.mjs', () => {
36
  };
37
 
38
  const provider = {
39
- generate: vi.fn(async () => 'NO\nIncorrect interpretation.'),
 
 
40
  };
41
 
42
  const result = await runVerifier(
@@ -49,10 +54,10 @@ describe('verifier_core.mjs', () => {
49
  );
50
 
51
  expect(result.ok).toBe(false);
52
- expect(result.raw.startsWith('NO')).toBe(true);
53
  });
54
 
55
- it('accepts lowercase/whitespace yes on first line', async () => {
56
  const sample = {
57
  question: 'What is love?',
58
  context: [{ content: 'ctx' }],
@@ -60,7 +65,9 @@ describe('verifier_core.mjs', () => {
60
  };
61
 
62
  const provider = {
63
- generate: vi.fn(async () => 'yes, this is fine\nMore details…'),
 
 
64
  };
65
 
66
  const result = await runVerifier(
@@ -73,5 +80,7 @@ describe('verifier_core.mjs', () => {
73
  );
74
 
75
  expect(result.ok).toBe(true);
 
 
76
  });
77
  });
 
3
  import { runVerifier } from '../src/verifier/verifier_core.mjs';
4
 
5
  describe('verifier_core.mjs', () => {
6
+ it('returns ok=true when SCORE >= 0.5 JSON', async () => {
7
  const sample = {
8
  question: 'What is love?',
9
  context: [{ content: 'ctx' }],
 
11
  };
12
 
13
  const provider = {
14
+ generate: vi.fn(async () =>
15
+ JSON.stringify({ REASONING: ['ok'], SCORE: 0.9 }),
16
+ ),
17
  };
18
 
19
  const result = await runVerifier(
 
27
 
28
  expect(provider.generate).toHaveBeenCalledOnce();
29
  expect(result.ok).toBe(true);
30
+ expect(result.score).toBe(0.9);
31
+ expect(result.reasoning).toEqual(['ok']);
32
  });
33
 
34
+ it('returns ok=false when SCORE < 0.5 JSON', async () => {
35
  const sample = {
36
  question: 'What is love?',
37
  context: [{ content: 'ctx' }],
 
39
  };
40
 
41
  const provider = {
42
+ generate: vi.fn(async () =>
43
+ JSON.stringify({ REASONING: ['bad'], SCORE: 0.1 }),
44
+ ),
45
  };
46
 
47
  const result = await runVerifier(
 
54
  );
55
 
56
  expect(result.ok).toBe(false);
57
+ expect(result.score).toBe(0.1);
58
  });
59
 
60
+ it('handles string SCORE and preserves reasoning', async () => {
61
  const sample = {
62
  question: 'What is love?',
63
  context: [{ content: 'ctx' }],
 
65
  };
66
 
67
  const provider = {
68
+ generate: vi.fn(async () =>
69
+ JSON.stringify({ REASONING: ['fine'], SCORE: '0.7' }),
70
+ ),
71
  };
72
 
73
  const result = await runVerifier(
 
80
  );
81
 
82
  expect(result.ok).toBe(true);
83
+ expect(result.score).toBe(0.7);
84
+ expect(result.reasoning).toEqual(['fine']);
85
  });
86
  });