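Verifier: fall back to raw PASS/FAIL tokens when parsing fails; log generator confidence, evidence, and limitations in verbose mode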
Browse files
- src/pipeline/step.mjs +22 -1
- src/verifier/verifier_core.mjs +13 -0
- tests/verifier_core.test.mjs +20 -0
src/pipeline/step.mjs
CHANGED
|
@@ -158,11 +158,32 @@ export async function runPipelineStep({
|
|
| 158 |
|
| 159 |
if (verbose) {
|
| 160 |
if (gen?.thought) {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
log(' [generator] thought:');
|
| 162 |
-
log(' ' + preview(
|
| 163 |
}
|
| 164 |
log(' [generator] answer:');
|
| 165 |
log(' ' + preview(gen?.answer ?? '', 400).replace(/\n/g, '\n '));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
}
|
| 167 |
} catch (e) {
|
| 168 |
const msg = e?.message || String(e);
|
|
|
|
| 158 |
|
| 159 |
if (verbose) {
|
| 160 |
if (gen?.thought) {
|
| 161 |
+
const thoughtPreview =
|
| 162 |
+
typeof gen.thought === 'string'
|
| 163 |
+
? gen.thought
|
| 164 |
+
: JSON.stringify(gen.thought, null, 2);
|
| 165 |
log(' [generator] thought:');
|
| 166 |
+
log(' ' + preview(thoughtPreview, 500).replace(/\n/g, '\n '));
|
| 167 |
}
|
| 168 |
log(' [generator] answer:');
|
| 169 |
log(' ' + preview(gen?.answer ?? '', 400).replace(/\n/g, '\n '));
|
| 170 |
+
if (gen?.confidence) {
|
| 171 |
+
log(' [generator] confidence: ' + gen.confidence);
|
| 172 |
+
}
|
| 173 |
+
if (gen?.evidence) {
|
| 174 |
+
log(
|
| 175 |
+
' [generator] evidence: ' +
|
| 176 |
+
preview(
|
| 177 |
+
Array.isArray(gen.evidence)
|
| 178 |
+
? gen.evidence.join(' | ')
|
| 179 |
+
: String(gen.evidence),
|
| 180 |
+
400,
|
| 181 |
+
).replace(/\n/g, '\n '),
|
| 182 |
+
);
|
| 183 |
+
}
|
| 184 |
+
if (gen?.limitations) {
|
| 185 |
+
log(' [generator] limitations: ' + preview(gen.limitations, 200));
|
| 186 |
+
}
|
| 187 |
}
|
| 188 |
} catch (e) {
|
| 189 |
const msg = e?.message || String(e);
|
src/verifier/verifier_core.mjs
CHANGED
|
@@ -121,6 +121,19 @@ export async function runVerifier({ question, context, gen }, provider) {
|
|
| 121 |
}
|
| 122 |
}
|
| 123 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
return { raw, ok, score, reasoning, error };
|
| 125 |
}
|
| 126 |
|
|
|
|
| 121 |
}
|
| 122 |
}
|
| 123 |
|
| 124 |
+
// Fallback: raw PASS/FAIL tokens even if parsing failed
|
| 125 |
+
if (!ok && typeof raw === 'string') {
|
| 126 |
+
if (/pass/i.test(raw) && !/fail/i.test(raw)) {
|
| 127 |
+
score = score ?? 'PASS';
|
| 128 |
+
ok = true;
|
| 129 |
+
error = null;
|
| 130 |
+
} else if (/fail/i.test(raw) && !/pass/i.test(raw)) {
|
| 131 |
+
score = score ?? 'FAIL';
|
| 132 |
+
ok = false;
|
| 133 |
+
error = null;
|
| 134 |
+
}
|
| 135 |
+
}
|
| 136 |
+
|
| 137 |
return { raw, ok, score, reasoning, error };
|
| 138 |
}
|
| 139 |
|
tests/verifier_core.test.mjs
CHANGED
|
@@ -160,4 +160,24 @@ describe('verifier_core.mjs', () => {
|
|
| 160 |
expect(res.score).toBe('PASS');
|
| 161 |
expect(Array.isArray(res.reasoning) || res.reasoning == null).toBe(true);
|
| 162 |
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
});
|
|
|
|
| 160 |
expect(res.score).toBe('PASS');
|
| 161 |
expect(Array.isArray(res.reasoning) || res.reasoning == null).toBe(true);
|
| 162 |
});
|
| 163 |
+
|
| 164 |
+
it('treats raw PASS token as ok', async () => {
|
| 165 |
+
const sample = {
|
| 166 |
+
question: 'What is love?',
|
| 167 |
+
context: [{ content: 'ctx' }],
|
| 168 |
+
gen: { answer: 'Reasonable answer', raw: 'Reasonable answer' },
|
| 169 |
+
};
|
| 170 |
+
|
| 171 |
+
const provider = {
|
| 172 |
+
generate: vi.fn(async () => 'PROMPT = PASS'),
|
| 173 |
+
};
|
| 174 |
+
|
| 175 |
+
const res = await runVerifier(
|
| 176 |
+
{ question: sample.question, context: sample.context, gen: sample.gen },
|
| 177 |
+
provider,
|
| 178 |
+
);
|
| 179 |
+
|
| 180 |
+
expect(res.ok).toBe(true);
|
| 181 |
+
expect(res.score).toBe('PASS');
|
| 182 |
+
});
|
| 183 |
});
|