updated a bunch of stuff
Browse files- AGENTS.md +3 -0
- README.md +1 -0
- prompts/generator_prompt.txt +45 -55
- scripts/cache_report.mjs +104 -0
- scripts/gold_preview.mjs +193 -0
- scripts/purge_mock_gold.mjs +66 -0
- scripts/regenerate_gold_from_cache.mjs +192 -0
- src/generator/generator_core.mjs +115 -8
- src/pipeline/batch.mjs +31 -2
- src/reward/reward_core.mjs +1 -1
- state_of_project.md +24 -0
- tests/generator_core.test.mjs +30 -10
- tests/gold_preview.test.mjs +65 -0
- tests/regenerate_gold_from_cache.test.mjs +112 -0
AGENTS.md
CHANGED
|
@@ -6,14 +6,17 @@
|
|
| 6 |
- Tests sit in `tests/` (Vitest), with sample seeds in `test_samples/`; pipeline outputs write to `gold/`.
|
| 7 |
- Config baselines (models, limits) are in `configs/pipeline.json`; run scripts live at the repo root (`run.sh`, `try_prompt.sh`).
|
| 8 |
- Cached intermediates (questions/gens/verifications/rewards) live in `data/cache/*.jsonl`; set `PIPELINE_CACHE_DIR` to redirect.
|
|
|
|
| 9 |
|
| 10 |
## Build, Test, and Development Commands
|
| 11 |
- `npm install` β install dependencies.
|
| 12 |
- `npm run pipeline -- --limit 20 --verbose` β run the default pipeline using static seeds.
|
| 13 |
- `PIPELINE_SEED_MODE=question-first npm run pipeline -- --limit 20 --verbose` β enable question-first seeding.
|
|
|
|
| 14 |
- `npm test` β run all unit tests (mocked by default).
|
| 15 |
- `REAL_ES=1 npm test` β exercise retrieval against a live Elasticsearch + embedding endpoint.
|
| 16 |
- Red/green pathway: use `*_PROVIDER=mock` plus JSONL chunk source to dry-run (green) without models; switch to real providers for red runs and the cache will skip already-completed stages.
|
|
|
|
| 17 |
|
| 18 |
## Coding Style & Naming Conventions
|
| 19 |
- ECMAScript modules (`type: "module"`); prefer `.mjs` for shared code.
|
|
|
|
| 6 |
- Tests sit in `tests/` (Vitest), with sample seeds in `test_samples/`; pipeline outputs write to `gold/`.
|
| 7 |
- Config baselines (models, limits) are in `configs/pipeline.json`; run scripts live at the repo root (`run.sh`, `try_prompt.sh`).
|
| 8 |
- Cached intermediates (questions/gens/verifications/rewards) live in `data/cache/*.jsonl`; set `PIPELINE_CACHE_DIR` to redirect.
|
| 9 |
+
- Random walk over chunks: set `PIPELINE_RANDOM_WALK=1` (or `PIPELINE_CHUNK_ORDER=random`) to shuffle chunk order using crypto randomness.
|
| 10 |
|
| 11 |
## Build, Test, and Development Commands
|
| 12 |
- `npm install` — install dependencies.
|
| 13 |
- `npm run pipeline -- --limit 20 --verbose` — run the default pipeline using static seeds.
|
| 14 |
- `PIPELINE_SEED_MODE=question-first npm run pipeline -- --limit 20 --verbose` — enable question-first seeding.
|
| 15 |
+
- Random-walk mode: `PIPELINE_RANDOM_WALK=1 QUESTION_MAX_PER_CHUNK=3 npm run pipeline -- --limit 3 --chunk-limit 10` shuffles chunks, caps questions per chunk at 3, processes at most 3 questions overall, and samples up to 10 chunks.
|
| 16 |
- `npm test` — run all unit tests (mocked by default).
|
| 17 |
- `REAL_ES=1 npm test` — exercise retrieval against a live Elasticsearch + embedding endpoint.
|
| 18 |
- Red/green pathway: use `*_PROVIDER=mock` plus JSONL chunk source to dry-run (green) without models; switch to real providers for red runs and the cache will skip already-completed stages.
|
| 19 |
+
- Verifier contract: models return JSON `{"REASONING": [...], "SCORE": <number|\"PASS\"|\"FAIL\">}`; SCORE >=0.5 or PASS β accepted. Prompt must remain unchanged; parsing is tolerant of the PASS/FAIL token format.
|
| 20 |
|
| 21 |
## Coding Style & Naming Conventions
|
| 22 |
- ECMAScript modules (`type: "module"`); prefer `.mjs` for shared code.
|
README.md
CHANGED
|
@@ -82,6 +82,7 @@ All pure modules include Vitest coverage:
|
|
| 82 |
* question generation
|
| 83 |
* provider router
|
| 84 |
* pipeline integration (mock)
|
|
|
|
| 85 |
|
| 86 |
---
|
| 87 |
|
|
|
|
| 82 |
* question generation
|
| 83 |
* provider router
|
| 84 |
* pipeline integration (mock)
|
| 85 |
+
* JSONL cache, PASS/FAIL verifier parsing
|
| 86 |
|
| 87 |
---
|
| 88 |
|
prompts/generator_prompt.txt
CHANGED
|
@@ -1,61 +1,51 @@
|
|
| 1 |
-
|
| 2 |
-
You are a knowledge distillation generator
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
<
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
- For EACH required
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
<
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
Confidence: [INTEGER 0-100]
|
| 45 |
-
Answer: [CONCISE RESPONSE OR EXACT FALLBACK PHRASE]
|
| 46 |
-
Evidence: [MAX 3 SHORT PHRASES] | [PARA #S]
|
| 47 |
-
Uncertainty_flags: [NONE/CONTEXT_GAPS/CONTRADICTIONS/BIAS_RISK]
|
| 48 |
-
|
| 49 |
-
## STRICT FORMATTING RULES
|
| 50 |
-
- XML tags MUST close properly
|
| 51 |
-
- Evidence phrases: β€7 words each
|
| 52 |
-
- Confidence calculations must show work in <synthesis>
|
| 53 |
-
- If context_verification fails: OUTPUT ONLY "I cannot answer this from the provided context." (NO tags)
|
| 54 |
-
- NEVER use markdown, asterisks, or special formatting
|
| 55 |
|
| 56 |
---
|
| 57 |
CONTEXT:
|
| 58 |
-
{{CONTEXT}}
|
|
|
|
| 59 |
|
| 60 |
QUESTION:
|
| 61 |
{{QUESTION}}
|
|
|
|
| 1 |
+
<|system|>
|
| 2 |
+
You are a knowledge distillation generator for training compact reasoning models. Your outputs MUST demonstrate pedagogical reasoning fidelity using Qwen3's native thinking protocol. Every output becomes gold training data.
|
| 3 |
+
|
| 4 |
+
<|rules|>
|
| 5 |
+
NON-NEGOTIABLE:
|
| 6 |
+
1. CONTEXT FIDELITY: Use ONLY visible elements in the CONTEXT (text, objects, relationships). No external knowledge.
|
| 7 |
+
2. VL-AWARE GROUNDING: Reference visual/text elements by position ("top-left graphic", "second paragraph") β NOT paragraph numbers.
|
| 8 |
+
3. CONFIDENCE SIMPLIFICATION: Use ONLY [High/Medium/Low] confidence levels β NO percentages or deltas.
|
| 9 |
+
4. FAILURE MODE: If context lacks critical evidence β output EXACTLY: "I cannot answer this from the provided context."
|
| 10 |
+
5. BIAS FLAGGING: Add "LIMITATION:" note ONLY for severe gaps/contradictions.
|
| 11 |
+
|
| 12 |
+
<|reasoning_protocol|>
|
| 13 |
+
<|thought|>
|
| 14 |
+
1. UNDERSTAND QUESTION:
|
| 15 |
+
- Break into atomic sub-questions
|
| 16 |
+
- Classify: [Factual] / [Visual-Text Fusion] / [Multi-Step Inference] / [Ambiguous]
|
| 17 |
+
|
| 18 |
+
2. CONTEXT VERIFICATION:
|
| 19 |
+
- For EACH required element:
|
| 20 |
+
β’ Describe location: "Table 3 in bottom-right", "Caption under Figure 2"
|
| 21 |
+
β’ Quote EXACT visible text snippet (β€10 words)
|
| 22 |
+
β’ Mark quality: [Clear] / [Blurry/Partial] / [Contradictory]
|
| 23 |
+
|
| 24 |
+
3. STEPWISE REASONING:
|
| 25 |
+
Step 1: [Action type: e.g., "Extract value from chart"]
|
| 26 |
+
Evidence: "Exact phrase" (location description)
|
| 27 |
+
Confidence: High/Medium/Low + 3-word reason ("low-contrast text")
|
| 28 |
+
|
| 29 |
+
Step 2: [Next action]... (repeat as needed)
|
| 30 |
+
|
| 31 |
+
4. SYNTHESIS:
|
| 32 |
+
- Resolve conflicts between steps
|
| 33 |
+
- Final confidence: [High/Medium/Low]
|
| 34 |
+
- LIMITATION: [None / Missing visual element / Text ambiguity]
|
| 35 |
+
<|end_of_thought|>
|
| 36 |
+
|
| 37 |
+
<|output_format|>
|
| 38 |
+
<|answer|>
|
| 39 |
+
Confidence: [High/Medium/Low]
|
| 40 |
+
Answer: [Concise response OR exact failure phrase]
|
| 41 |
+
Evidence: ["Phrase 1" (location), "Phrase 2" (location)]
|
| 42 |
+
Limitations: [None / ...]
|
| 43 |
+
<|end_of_answer|>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
---
|
| 46 |
CONTEXT:
|
| 47 |
+
{{CONTEXT}}
|
| 48 |
+
*(Note: For VL models, this contains BOTH text + visual scene description)*
|
| 49 |
|
| 50 |
QUESTION:
|
| 51 |
{{QUESTION}}
|
scripts/cache_report.mjs
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env node
|
| 2 |
+
// scripts/cache_report.mjs
|
| 3 |
+
// Summarize cache status (questions/generations/verifications/rewards).
|
| 4 |
+
|
| 5 |
+
import fs from 'fs/promises';
|
| 6 |
+
import path from 'path';
|
| 7 |
+
import { fileURLToPath } from 'url';
|
| 8 |
+
|
| 9 |
+
const __filename = fileURLToPath(import.meta.url);
|
| 10 |
+
const __dirname = path.dirname(__filename);
|
| 11 |
+
const PROJECT_ROOT = path.join(__dirname, '..');
|
| 12 |
+
|
| 13 |
+
const CACHE_DIR = (() => {
|
| 14 |
+
const custom = process.env.PIPELINE_CACHE_DIR;
|
| 15 |
+
if (custom) {
|
| 16 |
+
return path.isAbsolute(custom)
|
| 17 |
+
? custom
|
| 18 |
+
: path.join(PROJECT_ROOT, custom);
|
| 19 |
+
}
|
| 20 |
+
return path.join(PROJECT_ROOT, 'data', 'cache');
|
| 21 |
+
})();
|
| 22 |
+
|
| 23 |
+
const FILES = {
|
| 24 |
+
questions: 'questions.jsonl',
|
| 25 |
+
generations: 'generations.jsonl',
|
| 26 |
+
verifications: 'verifications.jsonl',
|
| 27 |
+
rewards: 'rewards.jsonl',
|
| 28 |
+
};
|
| 29 |
+
|
| 30 |
+
async function readJsonl(fileName) {
|
| 31 |
+
const filePath = path.join(CACHE_DIR, fileName);
|
| 32 |
+
try {
|
| 33 |
+
const txt = await fs.readFile(filePath, 'utf8');
|
| 34 |
+
return txt
|
| 35 |
+
.split('\n')
|
| 36 |
+
.map((l) => l.trim())
|
| 37 |
+
.filter(Boolean)
|
| 38 |
+
.map((line) => {
|
| 39 |
+
try {
|
| 40 |
+
return JSON.parse(line);
|
| 41 |
+
} catch {
|
| 42 |
+
return null;
|
| 43 |
+
}
|
| 44 |
+
})
|
| 45 |
+
.filter(Boolean);
|
| 46 |
+
} catch (e) {
|
| 47 |
+
if (e.code === 'ENOENT') return [];
|
| 48 |
+
throw e;
|
| 49 |
+
}
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
function uniq(arr) {
|
| 53 |
+
return [...new Set(arr)];
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
async function main() {
|
| 57 |
+
const questions = await readJsonl(FILES.questions);
|
| 58 |
+
const generations = await readJsonl(FILES.generations);
|
| 59 |
+
const verifications = await readJsonl(FILES.verifications);
|
| 60 |
+
const rewards = await readJsonl(FILES.rewards);
|
| 61 |
+
|
| 62 |
+
const chunkIds = uniq([
|
| 63 |
+
...questions.map((r) => r.chunk_id),
|
| 64 |
+
...generations.map((r) => r.chunk_id),
|
| 65 |
+
...verifications.map((r) => r.chunk_id),
|
| 66 |
+
...rewards.map((r) => r.chunk_id),
|
| 67 |
+
].filter(Boolean));
|
| 68 |
+
|
| 69 |
+
const totalQuestions = questions.reduce((acc, r) => {
|
| 70 |
+
if (Array.isArray(r.questions)) return acc + r.questions.length;
|
| 71 |
+
if (Array.isArray(r.question_ids)) return acc + r.question_ids.length;
|
| 72 |
+
return acc + 1;
|
| 73 |
+
}, 0);
|
| 74 |
+
|
| 75 |
+
const totalGenerations = generations.length;
|
| 76 |
+
const totalVerifications = verifications.length;
|
| 77 |
+
const totalRewards = rewards.length;
|
| 78 |
+
|
| 79 |
+
const passedVerifications = verifications.filter((v) => v.ok === true).length;
|
| 80 |
+
const passedRewards = rewards.filter((r) => r.ok === true).length;
|
| 81 |
+
|
| 82 |
+
const rows = [
|
| 83 |
+
['Cache dir', CACHE_DIR],
|
| 84 |
+
['Unique chunks', chunkIds.length],
|
| 85 |
+
['Question records', questions.length],
|
| 86 |
+
['Questions total', totalQuestions],
|
| 87 |
+
['Generation records', totalGenerations],
|
| 88 |
+
['Verification records', totalVerifications],
|
| 89 |
+
['Verifications ok', passedVerifications],
|
| 90 |
+
['Reward records', totalRewards],
|
| 91 |
+
['Rewards ok', passedRewards],
|
| 92 |
+
];
|
| 93 |
+
|
| 94 |
+
const colWidth = Math.max(...rows.map(([k]) => k.length)) + 2;
|
| 95 |
+
for (const [key, val] of rows) {
|
| 96 |
+
const pad = ' '.repeat(colWidth - key.length);
|
| 97 |
+
console.log(`${key}:${pad}${val}`);
|
| 98 |
+
}
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
main().catch((err) => {
|
| 102 |
+
console.error('Cache report error:', err);
|
| 103 |
+
process.exit(1);
|
| 104 |
+
});
|
scripts/gold_preview.mjs
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env node
|
| 2 |
+
// scripts/gold_preview.mjs
|
| 3 |
+
// Quick preview of gold JSONL entries (questions and answers).
|
| 4 |
+
|
| 5 |
+
import fs from 'fs/promises';
|
| 6 |
+
import path from 'path';
|
| 7 |
+
import { fileURLToPath } from 'url';
|
| 8 |
+
|
| 9 |
+
const __filename = fileURLToPath(import.meta.url);
|
| 10 |
+
const __dirname = path.dirname(__filename);
|
| 11 |
+
const PROJECT_ROOT = path.join(__dirname, '..');
|
| 12 |
+
|
| 13 |
+
function parseArgs() {
|
| 14 |
+
const args = process.argv.slice(2);
|
| 15 |
+
let limit = 5;
|
| 16 |
+
let fileArg;
|
| 17 |
+
let full = false;
|
| 18 |
+
let maxQuestion = 500;
|
| 19 |
+
let maxAnswer = 800;
|
| 20 |
+
let maxContext = 300;
|
| 21 |
+
|
| 22 |
+
for (let i = 0; i < args.length; i++) {
|
| 23 |
+
const a = args[i];
|
| 24 |
+
if (a === '--limit' || a === '-n') {
|
| 25 |
+
const v = Number(args[i + 1]);
|
| 26 |
+
if (Number.isFinite(v)) limit = v;
|
| 27 |
+
i++;
|
| 28 |
+
} else if (a === '--file' || a === '-f') {
|
| 29 |
+
fileArg = args[i + 1];
|
| 30 |
+
i++;
|
| 31 |
+
} else if (a === '--full') {
|
| 32 |
+
full = true;
|
| 33 |
+
} else if (a === '--max-question') {
|
| 34 |
+
const v = Number(args[i + 1]);
|
| 35 |
+
if (Number.isFinite(v)) maxQuestion = v;
|
| 36 |
+
i++;
|
| 37 |
+
} else if (a === '--max-answer') {
|
| 38 |
+
const v = Number(args[i + 1]);
|
| 39 |
+
if (Number.isFinite(v)) maxAnswer = v;
|
| 40 |
+
i++;
|
| 41 |
+
} else if (a === '--max-context') {
|
| 42 |
+
const v = Number(args[i + 1]);
|
| 43 |
+
if (Number.isFinite(v)) maxContext = v;
|
| 44 |
+
i++;
|
| 45 |
+
}
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
const goldPath =
|
| 49 |
+
fileArg ||
|
| 50 |
+
process.env.GOLD_PATH ||
|
| 51 |
+
path.join(PROJECT_ROOT, 'gold', 'pipeline_gold.jsonl');
|
| 52 |
+
|
| 53 |
+
if (full) {
|
| 54 |
+
maxQuestion = Infinity;
|
| 55 |
+
maxAnswer = Infinity;
|
| 56 |
+
maxContext = Infinity;
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
return { limit, goldPath, full, maxQuestion, maxAnswer, maxContext };
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
function preview(text, max = 200, full = false) {
|
| 63 |
+
if (full) return Array.isArray(text) ? text.join(' ') : String(text ?? '');
|
| 64 |
+
if (text == null) return '';
|
| 65 |
+
const str = Array.isArray(text) ? text.join(' ') : String(text);
|
| 66 |
+
if (str.length <= max) return str;
|
| 67 |
+
return str.slice(0, max) + `β¦ [+${str.length - max} chars]`;
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
async function main() {
|
| 71 |
+
const {
|
| 72 |
+
limit,
|
| 73 |
+
goldPath,
|
| 74 |
+
full,
|
| 75 |
+
maxQuestion,
|
| 76 |
+
maxAnswer,
|
| 77 |
+
maxContext,
|
| 78 |
+
} = parseArgs();
|
| 79 |
+
|
| 80 |
+
let raw;
|
| 81 |
+
try {
|
| 82 |
+
raw = await fs.readFile(goldPath, 'utf8');
|
| 83 |
+
} catch (e) {
|
| 84 |
+
if (e.code === 'ENOENT') {
|
| 85 |
+
console.error(`Gold file not found: ${goldPath}`);
|
| 86 |
+
process.exit(1);
|
| 87 |
+
}
|
| 88 |
+
throw e;
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
const lines = raw
|
| 92 |
+
.split('\n')
|
| 93 |
+
.map((l) => l.trim())
|
| 94 |
+
.filter(Boolean)
|
| 95 |
+
.slice(0, limit);
|
| 96 |
+
|
| 97 |
+
console.log(`Gold preview (${lines.length} of max ${limit}) from ${goldPath}\n`);
|
| 98 |
+
|
| 99 |
+
lines.forEach((line, idx) => {
|
| 100 |
+
let obj;
|
| 101 |
+
try {
|
| 102 |
+
obj = JSON.parse(line);
|
| 103 |
+
} catch {
|
| 104 |
+
console.log(`#${idx + 1}: [invalid JSON] ${preview(line, 120)}`);
|
| 105 |
+
return;
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
const q = obj.question || '[no question]';
|
| 109 |
+
const ans = obj.sample?.answer || obj.sample?.raw || '[no answer]';
|
| 110 |
+
const chunkId = obj.sourceChunkId || obj.context?.[0]?.id || '[unknown chunk]';
|
| 111 |
+
const ctxSnippet = obj.context?.[0]?.content || obj.sourceChunk || '';
|
| 112 |
+
const rew = obj.reward?.score ?? obj.reward?.ok;
|
| 113 |
+
const verOk = obj.verifier?.ok ?? obj.ver?.ok;
|
| 114 |
+
const verScore = obj.verifier?.score ?? obj.ver?.score;
|
| 115 |
+
|
| 116 |
+
console.log(`#${idx + 1}`);
|
| 117 |
+
console.log(`Chunk: ${chunkId}`);
|
| 118 |
+
console.log(`Q: ${preview(q, maxQuestion, full)}`);
|
| 119 |
+
console.log(`A: ${preview(ans, maxAnswer, full)}`);
|
| 120 |
+
if (ctxSnippet) console.log(`Ctx: ${preview(ctxSnippet, maxContext, full)}`);
|
| 121 |
+
if (verOk !== undefined) console.log(`Verifier ok: ${verOk}${verScore !== undefined ? ` (score: ${verScore})` : ''}`);
|
| 122 |
+
if (rew !== undefined) console.log(`Reward: ${rew}`);
|
| 123 |
+
console.log('');
|
| 124 |
+
});
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
// Exported for tests
|
| 128 |
+
export async function capturePreview() {
|
| 129 |
+
const {
|
| 130 |
+
limit,
|
| 131 |
+
goldPath,
|
| 132 |
+
full,
|
| 133 |
+
maxQuestion,
|
| 134 |
+
maxAnswer,
|
| 135 |
+
maxContext,
|
| 136 |
+
} = parseArgs();
|
| 137 |
+
|
| 138 |
+
let raw;
|
| 139 |
+
try {
|
| 140 |
+
raw = await fs.readFile(goldPath, 'utf8');
|
| 141 |
+
} catch (e) {
|
| 142 |
+
if (e.code === 'ENOENT') {
|
| 143 |
+
throw new Error(`Gold file not found: ${goldPath}`);
|
| 144 |
+
}
|
| 145 |
+
throw e;
|
| 146 |
+
}
|
| 147 |
+
|
| 148 |
+
const lines = raw
|
| 149 |
+
.split('\n')
|
| 150 |
+
.map((l) => l.trim())
|
| 151 |
+
.filter(Boolean)
|
| 152 |
+
.slice(0, limit);
|
| 153 |
+
|
| 154 |
+
const chunks = [];
|
| 155 |
+
|
| 156 |
+
chunks.push(`Gold preview (${lines.length} of max ${limit}) from ${goldPath}\n`);
|
| 157 |
+
|
| 158 |
+
lines.forEach((line, idx) => {
|
| 159 |
+
let obj;
|
| 160 |
+
try {
|
| 161 |
+
obj = JSON.parse(line);
|
| 162 |
+
} catch {
|
| 163 |
+
chunks.push(`#${idx + 1}: [invalid JSON] ${preview(line, 120)}`);
|
| 164 |
+
return;
|
| 165 |
+
}
|
| 166 |
+
|
| 167 |
+
const q = obj.question || '[no question]';
|
| 168 |
+
const ans = obj.sample?.answer || obj.sample?.raw || '[no answer]';
|
| 169 |
+
const chunkId = obj.sourceChunkId || obj.context?.[0]?.id || '[unknown chunk]';
|
| 170 |
+
const ctxSnippet = obj.context?.[0]?.content || obj.sourceChunk || '';
|
| 171 |
+
const rew = obj.reward?.score ?? obj.reward?.ok;
|
| 172 |
+
const verOk = obj.verifier?.ok ?? obj.ver?.ok;
|
| 173 |
+
const verScore = obj.verifier?.score ?? obj.ver?.score;
|
| 174 |
+
|
| 175 |
+
chunks.push(`#${idx + 1}`);
|
| 176 |
+
chunks.push(`Chunk: ${chunkId}`);
|
| 177 |
+
chunks.push(`Q: ${preview(q, maxQuestion, full)}`);
|
| 178 |
+
chunks.push(`A: ${preview(ans, maxAnswer, full)}`);
|
| 179 |
+
if (ctxSnippet) chunks.push(`Ctx: ${preview(ctxSnippet, maxContext, full)}`);
|
| 180 |
+
if (verOk !== undefined) chunks.push(`Verifier ok: ${verOk}${verScore !== undefined ? ` (score: ${verScore})` : ''}`);
|
| 181 |
+
if (rew !== undefined) chunks.push(`Reward: ${rew}`);
|
| 182 |
+
chunks.push('');
|
| 183 |
+
});
|
| 184 |
+
|
| 185 |
+
return chunks.join('\n');
|
| 186 |
+
}
|
| 187 |
+
|
| 188 |
+
if (process.argv[1] && process.argv[1].endsWith('gold_preview.mjs')) {
|
| 189 |
+
main().catch((err) => {
|
| 190 |
+
console.error('Gold preview error:', err);
|
| 191 |
+
process.exit(1);
|
| 192 |
+
});
|
| 193 |
+
}
|
scripts/purge_mock_gold.mjs
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env node
|
| 2 |
+
// scripts/purge_mock_gold.mjs
|
| 3 |
+
// Remove gold entries with mock/test questions (e.g., "Q1?") from pipeline_gold.jsonl
|
| 4 |
+
|
| 5 |
+
import fs from 'fs/promises';
|
| 6 |
+
import path from 'path';
|
| 7 |
+
import { fileURLToPath } from 'url';
|
| 8 |
+
|
| 9 |
+
const __filename = fileURLToPath(import.meta.url);
|
| 10 |
+
const __dirname = path.dirname(__filename);
|
| 11 |
+
const PROJECT_ROOT = path.join(__dirname, '..');
|
| 12 |
+
|
| 13 |
+
const GOLD_PATH =
|
| 14 |
+
process.env.GOLD_PATH ||
|
| 15 |
+
path.join(PROJECT_ROOT, 'gold', 'pipeline_gold.jsonl');
|
| 16 |
+
|
| 17 |
+
const questionRegex = new RegExp(process.env.PURGE_QUESTION_REGEX || '^Q1\\?$');
|
| 18 |
+
|
| 19 |
+
async function main() {
|
| 20 |
+
let raw;
|
| 21 |
+
try {
|
| 22 |
+
raw = await fs.readFile(GOLD_PATH, 'utf8');
|
| 23 |
+
} catch (e) {
|
| 24 |
+
if (e.code === 'ENOENT') {
|
| 25 |
+
console.error(`Gold file not found: ${GOLD_PATH}`);
|
| 26 |
+
process.exit(1);
|
| 27 |
+
}
|
| 28 |
+
throw e;
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
+
const lines = raw
|
| 32 |
+
.split('\n')
|
| 33 |
+
.map((l) => l.trim())
|
| 34 |
+
.filter(Boolean);
|
| 35 |
+
|
| 36 |
+
const kept = [];
|
| 37 |
+
const removed = [];
|
| 38 |
+
|
| 39 |
+
for (const line of lines) {
|
| 40 |
+
try {
|
| 41 |
+
const obj = JSON.parse(line);
|
| 42 |
+
const q = obj.question || '';
|
| 43 |
+
if (questionRegex.test(q)) {
|
| 44 |
+
removed.push(line);
|
| 45 |
+
} else {
|
| 46 |
+
kept.push(line);
|
| 47 |
+
}
|
| 48 |
+
} catch {
|
| 49 |
+
// if invalid JSON, keep it to avoid accidental loss
|
| 50 |
+
kept.push(line);
|
| 51 |
+
}
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
if (removed.length === 0) {
|
| 55 |
+
console.log('No matching entries to purge.');
|
| 56 |
+
return;
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
await fs.writeFile(GOLD_PATH, kept.join('\n') + '\n', 'utf8');
|
| 60 |
+
console.log(`Purged ${removed.length} entries matching ${questionRegex}. Kept ${kept.length}.`);
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
main().catch((err) => {
|
| 64 |
+
console.error('Purge error:', err);
|
| 65 |
+
process.exit(1);
|
| 66 |
+
});
|
scripts/regenerate_gold_from_cache.mjs
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env node
|
| 2 |
+
// scripts/regenerate_gold_from_cache.mjs
|
| 3 |
+
// Regenerate gold/pipeline_gold.jsonl from cache JSONL files.
|
| 4 |
+
|
| 5 |
+
import fs from 'fs/promises';
|
| 6 |
+
import path from 'path';
|
| 7 |
+
import { fileURLToPath } from 'url';
|
| 8 |
+
import { loadRagChunks } from '../src/retrieval/jsonl_chunks.mjs';
|
| 9 |
+
import {
|
| 10 |
+
questionId,
|
| 11 |
+
chunkIdFromContent,
|
| 12 |
+
} from '../src/pipeline/cache.mjs';
|
| 13 |
+
|
| 14 |
+
const __filename = fileURLToPath(import.meta.url);
|
| 15 |
+
const __dirname = path.dirname(__filename);
|
| 16 |
+
const PROJECT_ROOT = path.join(__dirname, '..');
|
| 17 |
+
|
| 18 |
+
const CACHE_DIR = (() => {
|
| 19 |
+
const custom = process.env.PIPELINE_CACHE_DIR;
|
| 20 |
+
if (custom) {
|
| 21 |
+
return path.isAbsolute(custom)
|
| 22 |
+
? custom
|
| 23 |
+
: path.join(PROJECT_ROOT, custom);
|
| 24 |
+
}
|
| 25 |
+
return path.join(PROJECT_ROOT, 'data', 'cache');
|
| 26 |
+
})();
|
| 27 |
+
|
| 28 |
+
const GOLD_PATH =
|
| 29 |
+
process.env.GOLD_PATH ||
|
| 30 |
+
path.join(PROJECT_ROOT, 'gold', 'pipeline_gold.jsonl');
|
| 31 |
+
|
| 32 |
+
const CACHE_FILES = {
|
| 33 |
+
questions: 'questions.jsonl',
|
| 34 |
+
generations: 'generations.jsonl',
|
| 35 |
+
verifications: 'verifications.jsonl',
|
| 36 |
+
rewards: 'rewards.jsonl',
|
| 37 |
+
};
|
| 38 |
+
|
| 39 |
+
async function readJsonl(fileName) {
|
| 40 |
+
const filePath = path.join(CACHE_DIR, fileName);
|
| 41 |
+
try {
|
| 42 |
+
const txt = await fs.readFile(filePath, 'utf8');
|
| 43 |
+
return txt
|
| 44 |
+
.split('\n')
|
| 45 |
+
.map((l) => l.trim())
|
| 46 |
+
.filter(Boolean)
|
| 47 |
+
.map((line) => {
|
| 48 |
+
try {
|
| 49 |
+
return JSON.parse(line);
|
| 50 |
+
} catch {
|
| 51 |
+
return null;
|
| 52 |
+
}
|
| 53 |
+
})
|
| 54 |
+
.filter(Boolean);
|
| 55 |
+
} catch (e) {
|
| 56 |
+
if (e.code === 'ENOENT') return [];
|
| 57 |
+
throw e;
|
| 58 |
+
}
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
function compositeKey(...parts) {
|
| 62 |
+
return parts.filter(Boolean).join('|');
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
async function loadChunksMap() {
|
| 66 |
+
const chunks = await loadRagChunks();
|
| 67 |
+
const map = new Map();
|
| 68 |
+
for (const c of chunks) {
|
| 69 |
+
const cid = c.id || chunkIdFromContent(c.content, c.sourceId || c.source?.id);
|
| 70 |
+
map.set(cid, c);
|
| 71 |
+
}
|
| 72 |
+
return map;
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
function latestByTs(records, keyFn) {
|
| 76 |
+
const map = new Map();
|
| 77 |
+
for (const r of records) {
|
| 78 |
+
const key = keyFn(r);
|
| 79 |
+
if (!key) continue;
|
| 80 |
+
const existing = map.get(key);
|
| 81 |
+
if (!existing || (r.ts && (!existing.ts || r.ts > existing.ts))) {
|
| 82 |
+
map.set(key, r);
|
| 83 |
+
}
|
| 84 |
+
}
|
| 85 |
+
return map;
|
| 86 |
+
}
|
| 87 |
+
|
| 88 |
+
function rewardOk(r) {
|
| 89 |
+
if (!r) return false;
|
| 90 |
+
if (r.ok === true) return true;
|
| 91 |
+
if (typeof r.score === 'number') return r.score >= 0.5;
|
| 92 |
+
if (typeof r.score === 'string') {
|
| 93 |
+
const t = r.score.trim().toLowerCase();
|
| 94 |
+
if (t === 'pass') return true;
|
| 95 |
+
const num = Number(r.score);
|
| 96 |
+
if (Number.isFinite(num)) return num >= 0.5;
|
| 97 |
+
}
|
| 98 |
+
return false;
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
async function main() {
|
| 102 |
+
const [questions, generations, verifications, rewards] = await Promise.all([
|
| 103 |
+
readJsonl(CACHE_FILES.questions),
|
| 104 |
+
readJsonl(CACHE_FILES.generations),
|
| 105 |
+
readJsonl(CACHE_FILES.verifications),
|
| 106 |
+
readJsonl(CACHE_FILES.rewards),
|
| 107 |
+
]);
|
| 108 |
+
|
| 109 |
+
const chunkMap = await loadChunksMap();
|
| 110 |
+
|
| 111 |
+
// Build questionId -> question text map
|
| 112 |
+
const qMap = new Map();
|
| 113 |
+
for (const rec of questions) {
|
| 114 |
+
const chunkId = rec.chunk_id;
|
| 115 |
+
if (!chunkId) continue;
|
| 116 |
+
const qs = Array.isArray(rec.questions)
|
| 117 |
+
? rec.questions
|
| 118 |
+
: rec.question
|
| 119 |
+
? [rec.question]
|
| 120 |
+
: [];
|
| 121 |
+
const qIds = Array.isArray(rec.question_ids) ? rec.question_ids : [];
|
| 122 |
+
|
| 123 |
+
for (let i = 0; i < qs.length; i++) {
|
| 124 |
+
const q = qs[i];
|
| 125 |
+
const providedId = qIds[i];
|
| 126 |
+
const hashedId = questionId(chunkId, q);
|
| 127 |
+
if (providedId) {
|
| 128 |
+
qMap.set(compositeKey(chunkId, providedId), q);
|
| 129 |
+
}
|
| 130 |
+
qMap.set(compositeKey(chunkId, hashedId), q);
|
| 131 |
+
}
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
+
// Latest generation per chunk+question (by ts)
|
| 135 |
+
const genMap = latestByTs(generations, (g) =>
|
| 136 |
+
compositeKey(g.chunk_id, g.question_id),
|
| 137 |
+
);
|
| 138 |
+
|
| 139 |
+
// Latest verification per chunk+question+gen
|
| 140 |
+
const verMap = latestByTs(verifications, (v) =>
|
| 141 |
+
compositeKey(v.chunk_id, v.question_id, v.gen_id),
|
| 142 |
+
);
|
| 143 |
+
|
| 144 |
+
// Latest reward per chunk+question+gen
|
| 145 |
+
const rewMap = latestByTs(rewards, (r) =>
|
| 146 |
+
compositeKey(r.chunk_id, r.question_id, r.gen_id),
|
| 147 |
+
);
|
| 148 |
+
|
| 149 |
+
const out = [];
|
| 150 |
+
let accepted = 0;
|
| 151 |
+
for (const [key, gen] of genMap.entries()) {
|
| 152 |
+
const [chunkId, qId] = key.split('|');
|
| 153 |
+
const question = qMap.get(compositeKey(chunkId, qId)) || '[unknown question]';
|
| 154 |
+
const chunk = chunkMap.get(chunkId) || {};
|
| 155 |
+
const context = [{ id: chunkId, content: chunk.content ?? chunk.text ?? '' }];
|
| 156 |
+
const ver = verMap.get(compositeKey(chunkId, qId, gen.gen_id));
|
| 157 |
+
const rew = rewMap.get(compositeKey(chunkId, qId, gen.gen_id));
|
| 158 |
+
|
| 159 |
+
const rewardIsOk = rewardOk(rew);
|
| 160 |
+
const verifierIsOk = ver?.ok === true;
|
| 161 |
+
if (!rewardIsOk && !verifierIsOk) continue;
|
| 162 |
+
accepted += 1;
|
| 163 |
+
|
| 164 |
+
out.push({
|
| 165 |
+
question,
|
| 166 |
+
sourceChunkId: chunkId,
|
| 167 |
+
sourceChunk: chunk.content ?? chunk.text,
|
| 168 |
+
sourceDoc: chunk.source,
|
| 169 |
+
context,
|
| 170 |
+
sample: gen,
|
| 171 |
+
verifier: ver,
|
| 172 |
+
reward: rew,
|
| 173 |
+
});
|
| 174 |
+
}
|
| 175 |
+
|
| 176 |
+
const lines = out.map((r) => JSON.stringify(r));
|
| 177 |
+
await fs.mkdir(path.dirname(GOLD_PATH), { recursive: true });
|
| 178 |
+
await fs.writeFile(GOLD_PATH, lines.join('\n') + '\n', 'utf8');
|
| 179 |
+
|
| 180 |
+
console.log(`Regenerated gold at ${GOLD_PATH}`);
|
| 181 |
+
console.log(`Accepted records: ${accepted}`);
|
| 182 |
+
console.log(`Total written: ${out.length}`);
|
| 183 |
+
}
|
| 184 |
+
|
| 185 |
+
if (import.meta.url === `file://${__filename}`) {
|
| 186 |
+
main().catch((err) => {
|
| 187 |
+
console.error('Regenerate gold error:', err);
|
| 188 |
+
process.exit(1);
|
| 189 |
+
});
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
export { main };
|
src/generator/generator_core.mjs
CHANGED
|
@@ -22,22 +22,129 @@ export async function runGenerator(question, contextChunks, provider) {
|
|
| 22 |
.replace('{{QUESTION}}', question)
|
| 23 |
.replace('{{CONTEXT}}', ctxText);
|
| 24 |
|
| 25 |
-
const
|
| 26 |
|
| 27 |
-
//
|
| 28 |
-
const
|
| 29 |
-
const
|
| 30 |
|
| 31 |
-
|
| 32 |
-
let answer = raw;
|
| 33 |
-
|
| 34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
}
|
| 36 |
|
| 37 |
return {
|
| 38 |
raw,
|
| 39 |
thought,
|
| 40 |
answer,
|
|
|
|
|
|
|
|
|
|
| 41 |
question,
|
| 42 |
context: contextChunks
|
| 43 |
};
|
|
|
|
| 22 |
.replace('{{QUESTION}}', question)
|
| 23 |
.replace('{{CONTEXT}}', ctxText);
|
| 24 |
|
| 25 |
+
const response = await provider.generate(prompt);
|
| 26 |
|
| 27 |
+
// Normalize provider output: string or { response, thinking }
|
| 28 |
+
const raw = typeof response === 'string' ? response : response?.response ?? '';
|
| 29 |
+
const thinkingObj = typeof response === 'object' && response?.thinking ? response.thinking : null;
|
| 30 |
|
| 31 |
+
let thought = null;
|
| 32 |
+
let answer = raw?.trim?.() ?? raw;
|
| 33 |
+
let confidence = null;
|
| 34 |
+
let evidence = null;
|
| 35 |
+
let limitations = null;
|
| 36 |
+
|
| 37 |
+
const safeParse = (txt) => {
|
| 38 |
+
if (!txt || typeof txt !== 'string') return null;
|
| 39 |
+
try {
|
| 40 |
+
return JSON.parse(txt);
|
| 41 |
+
} catch {
|
| 42 |
+
// try to extract braces substring
|
| 43 |
+
const start = txt.indexOf('{');
|
| 44 |
+
const end = txt.lastIndexOf('}');
|
| 45 |
+
if (start !== -1 && end !== -1 && end > start) {
|
| 46 |
+
try {
|
| 47 |
+
return JSON.parse(txt.slice(start, end + 1));
|
| 48 |
+
} catch {
|
| 49 |
+
return null;
|
| 50 |
+
}
|
| 51 |
+
}
|
| 52 |
+
return null;
|
| 53 |
+
}
|
| 54 |
+
};
|
| 55 |
+
|
| 56 |
+
// Prefer structured thinking object if provided
|
| 57 |
+
if (thinkingObj) {
|
| 58 |
+
thought = thinkingObj;
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
// Try parsing Qwen-style answer block first
|
| 62 |
+
const parseAnswerBlock = (txt) => {
|
| 63 |
+
if (!txt || typeof txt !== 'string') return null;
|
| 64 |
+
const blockMatch = txt.match(/<\|answer\|>([\s\S]*?)<\|end_of_answer\|>/i);
|
| 65 |
+
const body = blockMatch ? blockMatch[1] : txt;
|
| 66 |
+
const lines = body.split('\n').map((l) => l.trim()).filter(Boolean);
|
| 67 |
+
const result = {};
|
| 68 |
+
for (const line of lines) {
|
| 69 |
+
if (/^confidence:/i.test(line)) {
|
| 70 |
+
const val = line.split(':')[1]?.trim();
|
| 71 |
+
result.confidence = val || null;
|
| 72 |
+
} else if (/^answer:/i.test(line)) {
|
| 73 |
+
result.answer = line.split(':').slice(1).join(':').trim();
|
| 74 |
+
} else if (/^evidence:/i.test(line)) {
|
| 75 |
+
const evLine = line.split(':').slice(1).join(':').trim();
|
| 76 |
+
// Try to parse bracketed array, else split by comma
|
| 77 |
+
let ev = [];
|
| 78 |
+
const arrMatch = evLine.match(/\[(.*)\]/);
|
| 79 |
+
if (arrMatch) {
|
| 80 |
+
ev = arrMatch[1]
|
| 81 |
+
.split(/,(?=(?:[^'"]|'[^']*'|"[^"]*")*$)/)
|
| 82 |
+
.map((s) => s.replace(/^["'\s]+|["'\s]+$/g, ''))
|
| 83 |
+
.filter(Boolean);
|
| 84 |
+
} else {
|
| 85 |
+
ev = evLine.split(',').map((s) => s.trim()).filter(Boolean);
|
| 86 |
+
}
|
| 87 |
+
result.evidence = ev;
|
| 88 |
+
} else if (/^limitations:/i.test(line)) {
|
| 89 |
+
result.limitations = line.split(':').slice(1).join(':').trim();
|
| 90 |
+
}
|
| 91 |
+
}
|
| 92 |
+
return result;
|
| 93 |
+
};
|
| 94 |
+
|
| 95 |
+
const blockParsed = parseAnswerBlock(raw);
|
| 96 |
+
if (blockParsed?.answer) {
|
| 97 |
+
answer = blockParsed.answer;
|
| 98 |
+
confidence = blockParsed.confidence ?? confidence;
|
| 99 |
+
evidence = blockParsed.evidence ?? evidence;
|
| 100 |
+
limitations = blockParsed.limitations ?? limitations;
|
| 101 |
+
} else {
|
| 102 |
+
// fallback: parse JSON if it's actually JSON
|
| 103 |
+
const parsed = safeParse(raw);
|
| 104 |
+
if (parsed && typeof parsed === 'object') {
|
| 105 |
+
const reasoning = parsed.reasoning || parsed.REASONING;
|
| 106 |
+
if (Array.isArray(reasoning) && !thought) {
|
| 107 |
+
thought = reasoning.join(' ');
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
const ans =
|
| 111 |
+
parsed.answer ||
|
| 112 |
+
parsed.ANSWER ||
|
| 113 |
+
parsed.final ||
|
| 114 |
+
parsed.output;
|
| 115 |
+
if (typeof ans === 'string') {
|
| 116 |
+
answer = ans.trim();
|
| 117 |
+
} else if (Array.isArray(ans)) {
|
| 118 |
+
answer = ans.join(' ').trim();
|
| 119 |
+
}
|
| 120 |
+
|
| 121 |
+
if (parsed.confidence != null) {
|
| 122 |
+
const num = Number(parsed.confidence);
|
| 123 |
+
if (Number.isFinite(num)) confidence = num;
|
| 124 |
+
else if (typeof parsed.confidence === 'string') confidence = parsed.confidence;
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
if (parsed.evidence) evidence = parsed.evidence;
|
| 128 |
+
if (parsed.limitations) limitations = parsed.limitations;
|
| 129 |
+
} else {
|
| 130 |
+
// fallback: extract visible chain-of-thought tags if present
|
| 131 |
+
const thinkMatch = typeof raw === 'string'
|
| 132 |
+
? raw.match(/<think>([\s\S]*?)<\/think>/i)
|
| 133 |
+
: null;
|
| 134 |
+
thought = thought || (thinkMatch ? thinkMatch[1].trim() : null);
|
| 135 |
+
if (thinkMatch) {
|
| 136 |
+
answer = raw.slice(thinkMatch.index + thinkMatch[0].length).trim();
|
| 137 |
+
}
|
| 138 |
+
}
|
| 139 |
}
|
| 140 |
|
| 141 |
return {
|
| 142 |
raw,
|
| 143 |
thought,
|
| 144 |
answer,
|
| 145 |
+
confidence,
|
| 146 |
+
evidence,
|
| 147 |
+
limitations,
|
| 148 |
question,
|
| 149 |
context: contextChunks
|
| 150 |
};
|
src/pipeline/batch.mjs
CHANGED
|
@@ -3,6 +3,7 @@ import fs from 'fs/promises';
|
|
| 3 |
import path from 'path';
|
| 4 |
|
| 5 |
import { preview } from './util.mjs';
|
|
|
|
| 6 |
import {
|
| 7 |
DEFAULT_SEEDS_PATH,
|
| 8 |
DEFAULT_OUT_PATH,
|
|
@@ -123,7 +124,14 @@ export async function runPipelineBatch({
|
|
| 123 |
const record = {
|
| 124 |
question,
|
| 125 |
context: result.context,
|
| 126 |
-
sample:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
verifier: result.ver,
|
| 128 |
reward: result.rew,
|
| 129 |
};
|
|
@@ -224,6 +232,23 @@ export async function runPipelineBatch({
|
|
| 224 |
const totalChunks = chunks.length;
|
| 225 |
let processedSeeds = 0;
|
| 226 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
for (let idx = 0; idx < chunks.length; idx++) {
|
| 228 |
if (processed >= questionCap) break;
|
| 229 |
|
|
@@ -396,7 +421,11 @@ export async function runPipelineBatch({
|
|
| 396 |
sourceChunk: contextText,
|
| 397 |
sourceDoc: chunk.source,
|
| 398 |
context: result.context,
|
| 399 |
-
sample:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 400 |
verifier: result.ver,
|
| 401 |
reward: result.rew,
|
| 402 |
};
|
|
|
|
| 3 |
import path from 'path';
|
| 4 |
|
| 5 |
import { preview } from './util.mjs';
|
| 6 |
+
import crypto from 'crypto';
|
| 7 |
import {
|
| 8 |
DEFAULT_SEEDS_PATH,
|
| 9 |
DEFAULT_OUT_PATH,
|
|
|
|
| 124 |
const record = {
|
| 125 |
question,
|
| 126 |
context: result.context,
|
| 127 |
+
sample: {
|
| 128 |
+
answer: result.gen?.answer,
|
| 129 |
+
thought: result.gen?.thought,
|
| 130 |
+
raw: result.gen?.raw,
|
| 131 |
+
confidence: result.gen?.confidence,
|
| 132 |
+
evidence: result.gen?.evidence,
|
| 133 |
+
limitations: result.gen?.limitations,
|
| 134 |
+
},
|
| 135 |
verifier: result.ver,
|
| 136 |
reward: result.rew,
|
| 137 |
};
|
|
|
|
| 232 |
const totalChunks = chunks.length;
|
| 233 |
let processedSeeds = 0;
|
| 234 |
|
| 235 |
+
// Optional random walk over chunks
|
| 236 |
+
const randomWalk = (() => {
|
| 237 |
+
const v =
|
| 238 |
+
process.env.PIPELINE_RANDOM_WALK ||
|
| 239 |
+
process.env.PIPELINE_CHUNK_ORDER;
|
| 240 |
+
if (!v) return false;
|
| 241 |
+
const s = String(v).toLowerCase();
|
| 242 |
+
return s === '1' || s === 'true' || s === 'yes' || s === 'random';
|
| 243 |
+
})();
|
| 244 |
+
|
| 245 |
+
if (randomWalk && chunks.length > 1) {
|
| 246 |
+
for (let i = chunks.length - 1; i > 0; i--) {
|
| 247 |
+
const j = crypto.randomInt(i + 1);
|
| 248 |
+
[chunks[i], chunks[j]] = [chunks[j], chunks[i]];
|
| 249 |
+
}
|
| 250 |
+
}
|
| 251 |
+
|
| 252 |
for (let idx = 0; idx < chunks.length; idx++) {
|
| 253 |
if (processed >= questionCap) break;
|
| 254 |
|
|
|
|
| 421 |
sourceChunk: contextText,
|
| 422 |
sourceDoc: chunk.source,
|
| 423 |
context: result.context,
|
| 424 |
+
sample: {
|
| 425 |
+
answer: result.gen?.answer,
|
| 426 |
+
thought: result.gen?.thought,
|
| 427 |
+
raw: result.gen?.raw,
|
| 428 |
+
},
|
| 429 |
verifier: result.ver,
|
| 430 |
reward: result.rew,
|
| 431 |
};
|
src/reward/reward_core.mjs
CHANGED
|
@@ -22,7 +22,7 @@ export async function runReward({ question, context, gen }, provider) {
|
|
| 22 |
|
| 23 |
const prompt = tmpl
|
| 24 |
.replace(/{{QUESTION}}/g, question)
|
| 25 |
-
.replace(/{{ANSWER}}/g, gen.answer ||
|
| 26 |
.replace(/{{CONTEXT}}/g, ctxText);
|
| 27 |
|
| 28 |
const raw = await provider.generate(prompt);
|
|
|
|
| 22 |
|
| 23 |
const prompt = tmpl
|
| 24 |
.replace(/{{QUESTION}}/g, question)
|
| 25 |
+
.replace(/{{ANSWER}}/g, gen.answer || '')
|
| 26 |
.replace(/{{CONTEXT}}/g, ctxText);
|
| 27 |
|
| 28 |
const raw = await provider.generate(prompt);
|
state_of_project.md
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# State of Project
|
| 2 |
+
|
| 3 |
+
## What works
|
| 4 |
+
- Question-first pipeline with JSONL chunk source by default; deterministic chunk IDs and JSONL caches for questions, generations, verifications, and rewards.
|
| 5 |
+
- Providers: Ollama/OpenAI/HTTP plus mock provider; mock pathway enables full pipeline tests without GPUs or ES.
|
| 6 |
+
- Verifier parsing tolerates distributor format (`SCORE` as number or `PASS`/`FAIL` with noisy prefixes); caching and retry logic in place.
|
| 7 |
+
- Tests: 42 passing (retrieval mock/real, generator, verifier, reward, pipeline behaviour, cache, full mock pipeline).
|
| 8 |
+
- CLI defaults: verbose on, question-first, JSONL chunks; chunk/question limits respected.
|
| 9 |
+
|
| 10 |
+
## What needs attention
|
| 11 |
+
- Real pipeline currently fails at question generation when Ollama/question model is unreachable; run requires a live Ollama with the specified model pulled.
|
| 12 |
+
- Reward is still mocked in the recent run; swap to real reward provider/model when available.
|
| 13 |
+
- Verifier prompt must stay distributor-provided; parsing is tolerant but malformed outputs still log raw text in verbose mode.
|
| 14 |
+
- Deprecation warning from `punycode` (Node) shows during tests; benign but noisy.
|
| 15 |
+
|
| 16 |
+
## Risks
|
| 17 |
+
- Long generator outputs can inflate verifier context; may need truncation or smaller verifier model to avoid context overruns.
|
| 18 |
+
- Cache growth: JSONL caches can grow large; add rotation/compaction if running many cycles.
|
| 19 |
+
- ES mode defaults to 100 chunk fetch if used; confirm chunk limits when switching from JSONL to ES.
|
| 20 |
+
|
| 21 |
+
## Next steps (suggested)
|
| 22 |
+
- Pull and start question/answer/verifier/reward models on Ollama (or configure OpenAI/HTTP) and re-run a small batch (`--limit`/`--chunk-limit`) to validate end-to-end with real models.
|
| 23 |
+
- Add optional verifier retry when JSON parse fails (1 retry) and cap logged transcripts to reduce noise in verbose runs.
|
| 24 |
+
- Consider a cache inspection/cleanup script for `data/cache/*.jsonl`.
|
tests/generator_core.test.mjs
CHANGED
|
@@ -2,7 +2,7 @@ import { describe, it, expect, vi } from 'vitest';
|
|
| 2 |
import { runGenerator } from '../src/generator/generator_core.mjs';
|
| 3 |
|
| 4 |
describe('generator_core.mjs (thinking generator)', () => {
|
| 5 |
-
it('includes question and context in the prompt', async () => {
|
| 6 |
const fakeContext = [
|
| 7 |
{ content: 'First context chunk' },
|
| 8 |
{ content: 'Second context chunk' },
|
|
@@ -16,12 +16,13 @@ describe('generator_core.mjs (thinking generator)', () => {
|
|
| 16 |
expect(prompt).toContain('First context chunk');
|
| 17 |
expect(prompt).toContain('Second context chunk');
|
| 18 |
|
| 19 |
-
// Return
|
| 20 |
-
return
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
|
|
|
| 25 |
}),
|
| 26 |
};
|
| 27 |
|
|
@@ -30,9 +31,10 @@ Love is the recognition of shared being.
|
|
| 30 |
expect(provider.generate).toHaveBeenCalledOnce();
|
| 31 |
expect(result.question).toBe('What is love?');
|
| 32 |
expect(result.context).toHaveLength(2);
|
| 33 |
-
expect(result.raw).toContain('
|
| 34 |
expect(result.answer).toBe('Love is the recognition of shared being.');
|
| 35 |
-
expect(result.thought).toContain('
|
|
|
|
| 36 |
});
|
| 37 |
|
| 38 |
it('extracts thought and answer correctly when <think> block is present', async () => {
|
|
@@ -78,8 +80,26 @@ The final answer derived from the context.`;
|
|
| 78 |
);
|
| 79 |
|
| 80 |
expect(result.raw).toBe('Just a direct answer with no visible reasoning.');
|
| 81 |
-
// No think tags means thought=null and answer = full output
|
| 82 |
expect(result.thought).toBeNull();
|
| 83 |
expect(result.answer).toBe('Just a direct answer with no visible reasoning.');
|
| 84 |
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
});
|
|
|
|
| 2 |
import { runGenerator } from '../src/generator/generator_core.mjs';
|
| 3 |
|
| 4 |
describe('generator_core.mjs (thinking generator)', () => {
|
| 5 |
+
it('includes question and context in the prompt and parses JSON output', async () => {
|
| 6 |
const fakeContext = [
|
| 7 |
{ content: 'First context chunk' },
|
| 8 |
{ content: 'Second context chunk' },
|
|
|
|
| 16 |
expect(prompt).toContain('First context chunk');
|
| 17 |
expect(prompt).toContain('Second context chunk');
|
| 18 |
|
| 19 |
+
// Return JSON output
|
| 20 |
+
return JSON.stringify({
|
| 21 |
+
reasoning: ['step A', 'step B'],
|
| 22 |
+
answer: 'Love is the recognition of shared being.',
|
| 23 |
+
confidence: 0.92,
|
| 24 |
+
evidence: ['quote (para #1)'],
|
| 25 |
+
});
|
| 26 |
}),
|
| 27 |
};
|
| 28 |
|
|
|
|
| 31 |
expect(provider.generate).toHaveBeenCalledOnce();
|
| 32 |
expect(result.question).toBe('What is love?');
|
| 33 |
expect(result.context).toHaveLength(2);
|
| 34 |
+
expect(result.raw).toContain('step A');
|
| 35 |
expect(result.answer).toBe('Love is the recognition of shared being.');
|
| 36 |
+
expect(result.thought).toContain('step A');
|
| 37 |
+
expect(result.confidence).toBeCloseTo(0.92);
|
| 38 |
});
|
| 39 |
|
| 40 |
it('extracts thought and answer correctly when <think> block is present', async () => {
|
|
|
|
| 80 |
);
|
| 81 |
|
| 82 |
expect(result.raw).toBe('Just a direct answer with no visible reasoning.');
|
| 83 |
+
// No JSON or think tags means thought=null and answer = full output
|
| 84 |
expect(result.thought).toBeNull();
|
| 85 |
expect(result.answer).toBe('Just a direct answer with no visible reasoning.');
|
| 86 |
});
|
| 87 |
+
|
| 88 |
+
it('parses Qwen answer block and preserves thinking object', async () => {
|
| 89 |
+
const fakeContext = [{ content: 'ctx' }];
|
| 90 |
+
const provider = {
|
| 91 |
+
generate: vi.fn(async () => ({
|
| 92 |
+
response: `<|thought|>step1<|end_of_thought|>\n<|answer|>\nConfidence: High\nAnswer: Supported answer\nEvidence: ["quote1 (loc1)", "quote2 (loc2)"]\nLimitations: None\n<|end_of_answer|>`,
|
| 93 |
+
thinking: { steps: ['t1', 't2'] },
|
| 94 |
+
})),
|
| 95 |
+
};
|
| 96 |
+
|
| 97 |
+
const result = await runGenerator('Test?', fakeContext, provider);
|
| 98 |
+
|
| 99 |
+
expect(result.thought).toEqual({ steps: ['t1', 't2'] });
|
| 100 |
+
expect(result.answer).toBe('Supported answer');
|
| 101 |
+
expect(result.confidence).toBe('High');
|
| 102 |
+
expect(result.evidence).toEqual(['quote1 (loc1)', 'quote2 (loc2)']);
|
| 103 |
+
expect(result.limitations).toBe('None');
|
| 104 |
+
});
|
| 105 |
});
|
tests/gold_preview.test.mjs
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
|
| 2 |
+
import fs from 'fs/promises';
|
| 3 |
+
import path from 'path';
|
| 4 |
+
import os from 'os';
|
| 5 |
+
import { fileURLToPath } from 'url';
|
| 6 |
+
|
| 7 |
+
const __filename = fileURLToPath(import.meta.url);
|
| 8 |
+
const __dirname = path.dirname(__filename);
|
| 9 |
+
const PROJECT_ROOT = path.join(__dirname, '..');
|
| 10 |
+
|
| 11 |
+
const sampleGold = [
|
| 12 |
+
JSON.stringify({
|
| 13 |
+
question: 'What is the meaning of life?',
|
| 14 |
+
sample: { answer: '42', raw: '42' },
|
| 15 |
+
context: [{ id: 'c1', content: 'ctx content' }],
|
| 16 |
+
verifier: { ok: true, score: 0.9 },
|
| 17 |
+
reward: { score: 0.8 },
|
| 18 |
+
}),
|
| 19 |
+
JSON.stringify({
|
| 20 |
+
question: 'Q1?',
|
| 21 |
+
sample: { answer: 'a'.repeat(50) },
|
| 22 |
+
context: [{ id: 'c2', content: 'ctx content 2' }],
|
| 23 |
+
}),
|
| 24 |
+
].join('\n');
|
| 25 |
+
|
| 26 |
+
describe('scripts/gold_preview.mjs', () => {
|
| 27 |
+
let tmpFile;
|
| 28 |
+
const origArgv = process.argv.slice();
|
| 29 |
+
|
| 30 |
+
beforeEach(async () => {
|
| 31 |
+
const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), 'gold-prev-'));
|
| 32 |
+
tmpFile = path.join(tmpDir, 'gold.jsonl');
|
| 33 |
+
await fs.writeFile(tmpFile, sampleGold, 'utf8');
|
| 34 |
+
});
|
| 35 |
+
|
| 36 |
+
afterEach(async () => {
|
| 37 |
+
process.argv = origArgv.slice();
|
| 38 |
+
if (tmpFile) {
|
| 39 |
+
await fs.rm(path.dirname(tmpFile), { recursive: true, force: true }).catch(() => {});
|
| 40 |
+
}
|
| 41 |
+
});
|
| 42 |
+
|
| 43 |
+
it('respects --limit', async () => {
|
| 44 |
+
process.argv = ['node', 'gold_preview.mjs', '--file', tmpFile, '--limit', '1'];
|
| 45 |
+
const { capturePreview } = await import('../scripts/gold_preview.mjs');
|
| 46 |
+
const output = await capturePreview();
|
| 47 |
+
const lines = output.split('\n').filter(Boolean);
|
| 48 |
+
expect(lines.some((l) => l.startsWith('#2'))).toBe(false);
|
| 49 |
+
});
|
| 50 |
+
|
| 51 |
+
it('respects --max-answer truncation', async () => {
|
| 52 |
+
process.argv = ['node', 'gold_preview.mjs', '--file', tmpFile, '--max-answer', '10'];
|
| 53 |
+
const { capturePreview } = await import('../scripts/gold_preview.mjs');
|
| 54 |
+
const output = await capturePreview();
|
| 55 |
+
expect(output).toMatch(/A: a{10}β¦ \[\+40 chars\]/);
|
| 56 |
+
});
|
| 57 |
+
|
| 58 |
+
it('shows full when --full is set', async () => {
|
| 59 |
+
process.argv = ['node', 'gold_preview.mjs', '--file', tmpFile, '--full', '--limit', '1'];
|
| 60 |
+
const { capturePreview } = await import('../scripts/gold_preview.mjs');
|
| 61 |
+
const output = await capturePreview();
|
| 62 |
+
expect(output).toMatch(/A: 42/);
|
| 63 |
+
expect(output).not.toMatch(/\[\+\d+ chars\]/);
|
| 64 |
+
});
|
| 65 |
+
});
|
tests/regenerate_gold_from_cache.test.mjs
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
|
| 2 |
+
import fs from 'fs/promises';
|
| 3 |
+
import path from 'path';
|
| 4 |
+
import os from 'os';
|
| 5 |
+
import { fileURLToPath, pathToFileURL } from 'url';
|
| 6 |
+
|
| 7 |
+
const __filename = fileURLToPath(import.meta.url);
|
| 8 |
+
const __dirname = path.dirname(__filename);
|
| 9 |
+
const PROJECT_ROOT = path.join(__dirname, '..');
|
| 10 |
+
|
| 11 |
+
describe('scripts/regenerate_gold_from_cache.mjs', () => {
|
| 12 |
+
let tmpDir;
|
| 13 |
+
let cacheDir;
|
| 14 |
+
let goldPath;
|
| 15 |
+
let ragPath;
|
| 16 |
+
const origEnv = { ...process.env };
|
| 17 |
+
|
| 18 |
+
beforeEach(async () => {
|
| 19 |
+
tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), 'regen-cache-'));
|
| 20 |
+
cacheDir = path.join(tmpDir, 'cache');
|
| 21 |
+
goldPath = path.join(tmpDir, 'pipeline_gold.jsonl');
|
| 22 |
+
ragPath = path.join(tmpDir, 'rag_chunks.jsonl');
|
| 23 |
+
|
| 24 |
+
process.env.PIPELINE_CACHE_DIR = cacheDir;
|
| 25 |
+
process.env.GOLD_PATH = goldPath;
|
| 26 |
+
process.env.RAG_CHUNKS_PATH = ragPath;
|
| 27 |
+
|
| 28 |
+
await fs.mkdir(cacheDir, { recursive: true });
|
| 29 |
+
await fs.mkdir(path.dirname(goldPath), { recursive: true });
|
| 30 |
+
await fs.mkdir(path.dirname(ragPath), { recursive: true });
|
| 31 |
+
});
|
| 32 |
+
|
| 33 |
+
afterEach(async () => {
|
| 34 |
+
process.env = { ...origEnv };
|
| 35 |
+
await fs.rm(tmpDir, { recursive: true, force: true }).catch(() => {});
|
| 36 |
+
});
|
| 37 |
+
|
| 38 |
+
it('reconstructs gold from cache with reward/verifier ok', async () => {
|
| 39 |
+
// write chunks
|
| 40 |
+
const chunk = { id: 'c1', content: 'chunk content', source: { meta: 1 } };
|
| 41 |
+
await fs.writeFile(ragPath, JSON.stringify(chunk) + '\n', 'utf8');
|
| 42 |
+
|
| 43 |
+
// questions cache
|
| 44 |
+
await fs.writeFile(
|
| 45 |
+
path.join(cacheDir, 'questions.jsonl'),
|
| 46 |
+
JSON.stringify({
|
| 47 |
+
chunk_id: 'c1',
|
| 48 |
+
questions: ['What is X?'],
|
| 49 |
+
question_ids: ['q1'],
|
| 50 |
+
ts: Date.now(),
|
| 51 |
+
}) + '\n',
|
| 52 |
+
'utf8',
|
| 53 |
+
);
|
| 54 |
+
|
| 55 |
+
// generations cache
|
| 56 |
+
await fs.writeFile(
|
| 57 |
+
path.join(cacheDir, 'generations.jsonl'),
|
| 58 |
+
JSON.stringify({
|
| 59 |
+
chunk_id: 'c1',
|
| 60 |
+
question_id: 'q1',
|
| 61 |
+
gen_id: 'g1',
|
| 62 |
+
answer: 'Answer text',
|
| 63 |
+
raw: 'Answer text',
|
| 64 |
+
ts: Date.now(),
|
| 65 |
+
}) + '\n',
|
| 66 |
+
'utf8',
|
| 67 |
+
);
|
| 68 |
+
|
| 69 |
+
// verification cache (ok)
|
| 70 |
+
await fs.writeFile(
|
| 71 |
+
path.join(cacheDir, 'verifications.jsonl'),
|
| 72 |
+
JSON.stringify({
|
| 73 |
+
chunk_id: 'c1',
|
| 74 |
+
question_id: 'q1',
|
| 75 |
+
gen_id: 'g1',
|
| 76 |
+
ok: true,
|
| 77 |
+
score: 'PASS',
|
| 78 |
+
raw: '...',
|
| 79 |
+
ts: Date.now(),
|
| 80 |
+
}) + '\n',
|
| 81 |
+
'utf8',
|
| 82 |
+
);
|
| 83 |
+
|
| 84 |
+
// reward cache (ok)
|
| 85 |
+
await fs.writeFile(
|
| 86 |
+
path.join(cacheDir, 'rewards.jsonl'),
|
| 87 |
+
JSON.stringify({
|
| 88 |
+
chunk_id: 'c1',
|
| 89 |
+
question_id: 'q1',
|
| 90 |
+
gen_id: 'g1',
|
| 91 |
+
ok: true,
|
| 92 |
+
score: 0.9,
|
| 93 |
+
raw: '0.9',
|
| 94 |
+
ts: Date.now(),
|
| 95 |
+
}) + '\n',
|
| 96 |
+
'utf8',
|
| 97 |
+
);
|
| 98 |
+
|
| 99 |
+
const mod = await import(pathToFileURL(path.join(PROJECT_ROOT, 'scripts', 'regenerate_gold_from_cache.mjs')));
|
| 100 |
+
await mod.main();
|
| 101 |
+
|
| 102 |
+
const out = await fs.readFile(goldPath, 'utf8');
|
| 103 |
+
const lines = out.split('\n').filter(Boolean);
|
| 104 |
+
expect(lines).toHaveLength(1);
|
| 105 |
+
|
| 106 |
+
const rec = JSON.parse(lines[0]);
|
| 107 |
+
expect(rec.question).toBe('What is X?');
|
| 108 |
+
expect(rec.sample.answer).toBe('Answer text');
|
| 109 |
+
expect(rec.verifier.ok).toBe(true);
|
| 110 |
+
expect(rec.reward.score).toBe(0.9);
|
| 111 |
+
});
|
| 112 |
+
});
|