| | #!/usr/bin/env node |
| | |
| | |
| |
|
| | import fs from 'fs/promises'; |
| | import path from 'path'; |
| | import { fileURLToPath } from 'url'; |
| | import { loadRagChunks } from '../src/retrieval/jsonl_chunks.mjs'; |
| | import { |
| | questionId, |
| | chunkIdFromContent, |
| | } from '../src/pipeline/cache.mjs'; |
| |
|
// ES modules have no built-in __filename/__dirname, so derive them from this module's URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// The directory one level above the directory containing this script.
const PROJECT_ROOT = path.resolve(__dirname, '..');
| |
|
// Directory holding the pipeline cache files.
// PIPELINE_CACHE_DIR overrides the default; an absolute override is used as-is,
// a relative one is joined onto PROJECT_ROOT. Without an override the cache
// lives in <PROJECT_ROOT>/data/cache.
const CACHE_DIR = (() => {
  const override = process.env.PIPELINE_CACHE_DIR;
  if (!override) {
    return path.join(PROJECT_ROOT, 'data', 'cache');
  }
  if (path.isAbsolute(override)) {
    return override;
  }
  return path.join(PROJECT_ROOT, override);
})();
| |
|
// Output file for the regenerated gold set. A non-empty GOLD_PATH environment
// variable is used verbatim (it is NOT re-based onto PROJECT_ROOT); otherwise
// the file is <PROJECT_ROOT>/gold/pipeline_gold.jsonl.
const GOLD_PATH = process.env.GOLD_PATH
  ? process.env.GOLD_PATH
  : path.join(PROJECT_ROOT, 'gold', 'pipeline_gold.jsonl');
| |
|
// File names (relative to CACHE_DIR) of the JSONL cache files this script reads.
// Frozen so the shared lookup table cannot be mutated by accident.
const CACHE_FILES = Object.freeze({
  questions: 'questions.jsonl',
  generations: 'generations.jsonl',
  verifications: 'verifications.jsonl',
  rewards: 'rewards.jsonl',
});
| |
|
/**
 * Read a JSONL file from CACHE_DIR and return its parsed records.
 *
 * Lines are trimmed; blank lines are ignored. A line that is not valid JSON,
 * or that parses to a falsy value, is dropped without any report. A missing
 * file yields an empty array.
 *
 * @param {string} fileName - File name relative to CACHE_DIR.
 * @returns {Promise<Array<*>>} Parsed records in file order.
 * @throws Any read error other than ENOENT is rethrown unchanged.
 */
async function readJsonl(fileName) {
  const filePath = path.join(CACHE_DIR, fileName);

  let raw;
  try {
    raw = await fs.readFile(filePath, 'utf8');
  } catch (err) {
    if (err.code === 'ENOENT') return [];
    throw err;
  }

  const records = [];
  for (const rawLine of raw.split('\n')) {
    const line = rawLine.trim();
    if (!line) continue;

    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch {
      // Best-effort reader: a malformed line is skipped, not fatal.
      continue;
    }
    if (parsed) records.push(parsed);
  }
  return records;
}
| |
|
/**
 * Build a lookup key by joining the truthy parts with '|'.
 *
 * Falsy parts (undefined, null, '', 0, ...) are dropped rather than left as
 * empty slots, so the position of a part is not preserved when an earlier one
 * is missing. Parts are not escaped; a part containing '|' makes the key
 * ambiguous.
 *
 * @param {...*} parts - Key components, in order.
 * @returns {string} The joined key; '' when no part is truthy.
 */
function compositeKey(...parts) {
  const present = [];
  for (const part of parts) {
    if (part) present.push(part);
  }
  return present.join('|');
}
| |
|
/**
 * Load every chunk via loadRagChunks() and index them by chunk id.
 *
 * A chunk's own `id` is used when truthy; otherwise the id comes from
 * chunkIdFromContent(content, sourceId), where sourceId is `chunk.sourceId`
 * or, failing that, `chunk.source.id`. When two chunks resolve to the same
 * id, the later one replaces the earlier one.
 *
 * @returns {Promise<Map<*, object>>} Map from chunk id to chunk.
 */
async function loadChunksMap() {
  const byId = new Map();
  for (const chunk of await loadRagChunks()) {
    let id = chunk.id;
    if (!id) {
      const sourceId = chunk.sourceId || chunk.source?.id;
      id = chunkIdFromContent(chunk.content, sourceId);
    }
    byId.set(id, chunk);
  }
  return byId;
}
| |
|
/**
 * Reduce records to one per key, preferring the record with the greatest `ts`.
 *
 * Records whose key is falsy are ignored. For a key that is already present,
 * a new record replaces the stored one only when the new record has a truthy
 * `ts` and the stored one either has no `ts` or a strictly smaller one.
 * Consequently ties, and records without `ts`, never displace the record
 * that was seen first.
 *
 * @param {Iterable<object>} records - Records to scan, in order.
 * @param {(record: object) => *} keyFn - Computes the grouping key.
 * @returns {Map<*, object>} Map from key to the chosen record.
 */
function latestByTs(records, keyFn) {
  // True when `candidate` should take the slot currently held by `current`.
  const supersedes = (candidate, current) => {
    if (!current) return true;
    if (!candidate.ts) return false;
    return !current.ts || candidate.ts > current.ts;
  };

  const latest = new Map();
  for (const record of records) {
    const key = keyFn(record);
    if (!key) continue;
    if (supersedes(record, latest.get(key))) {
      latest.set(key, record);
    }
  }
  return latest;
}
| |
|
/**
 * Decide whether a reward record counts as a pass.
 *
 * Rules, in order:
 *  - a missing record fails;
 *  - `ok === true` passes;
 *  - a numeric `score` passes when it is >= 0.5;
 *  - a string `score` passes when it is "pass" (ignoring case and surrounding
 *    whitespace) or when it converts to a finite number >= 0.5;
 *  - anything else fails.
 *
 * @param {object|null|undefined} r - Reward record.
 * @returns {boolean}
 */
function rewardOk(r) {
  if (!r) return false;
  if (r.ok === true) return true;

  const { score } = r;
  switch (typeof score) {
    case 'number':
      return score >= 0.5;
    case 'string': {
      if (score.trim().toLowerCase() === 'pass') return true;
      const parsed = Number(score);
      return Number.isFinite(parsed) && parsed >= 0.5;
    }
    default:
      return false;
  }
}
| |
|
/**
 * Rebuild the gold JSONL file from the pipeline cache.
 *
 * Steps:
 *  1. Read the questions / generations / verifications / rewards cache files
 *     and load the chunk index (all in parallel; they are independent).
 *  2. Index question text by (chunk id, question id), under both the id
 *     recorded in the cache and the id computed by questionId().
 *  3. Keep the latest generation per (chunk id, question id) and the latest
 *     verification / reward per (chunk id, question id, gen id).
 *  4. Emit one gold record for every kept generation whose reward passes
 *     rewardOk() or whose verification has `ok === true`.
 *  5. Write the records to GOLD_PATH, one JSON document per line, creating
 *     the parent directory if needed, and log a short summary.
 *
 * @returns {Promise<void>}
 * @throws Rejects on any read, chunk-load, or write failure.
 */
async function main() {
  const [questions, generations, verifications, rewards, chunkMap] =
    await Promise.all([
      readJsonl(CACHE_FILES.questions),
      readJsonl(CACHE_FILES.generations),
      readJsonl(CACHE_FILES.verifications),
      readJsonl(CACHE_FILES.rewards),
      loadChunksMap(),
    ]);

  // Question text keyed by chunk id + question id.
  const qMap = new Map();
  for (const rec of questions) {
    const chunkId = rec.chunk_id;
    if (!chunkId) continue;

    // A record carries either a `questions` array or a single `question`.
    let qs = [];
    if (Array.isArray(rec.questions)) {
      qs = rec.questions;
    } else if (rec.question) {
      qs = [rec.question];
    }
    const qIds = Array.isArray(rec.question_ids) ? rec.question_ids : [];

    qs.forEach((q, i) => {
      const providedId = qIds[i];
      if (providedId) {
        qMap.set(compositeKey(chunkId, providedId), q);
      }
      // Always register the computed id too, so a generation that refers to
      // the question by either id can be resolved.
      qMap.set(compositeKey(chunkId, questionId(chunkId, q)), q);
    });
  }

  const genMap = latestByTs(generations, (g) =>
    compositeKey(g.chunk_id, g.question_id),
  );
  const verMap = latestByTs(verifications, (v) =>
    compositeKey(v.chunk_id, v.question_id, v.gen_id),
  );
  const rewMap = latestByTs(rewards, (r) =>
    compositeKey(r.chunk_id, r.question_id, r.gen_id),
  );

  const out = [];
  for (const gen of genMap.values()) {
    // Take the ids from the record itself. Splitting the composite key back
    // apart is lossy: compositeKey() drops missing parts and does not escape
    // '|', so a missing chunk_id or an id containing '|' would be mis-split.
    const chunkId = gen.chunk_id;
    const qId = gen.question_id;

    const genKey = compositeKey(chunkId, qId, gen.gen_id);
    const ver = verMap.get(genKey);
    const rew = rewMap.get(genKey);

    // Accept when either signal approves the generation.
    if (!rewardOk(rew) && ver?.ok !== true) continue;

    const question =
      qMap.get(compositeKey(chunkId, qId)) || '[unknown question]';
    const chunk = chunkMap.get(chunkId) || {};
    const chunkText = chunk.content ?? chunk.text;

    out.push({
      question,
      sourceChunkId: chunkId,
      sourceChunk: chunkText,
      sourceDoc: chunk.source,
      context: [{ id: chunkId, content: chunkText ?? '' }],
      sample: gen,
      verifier: ver,
      reward: rew,
    });
  }

  // One JSON document per line, newline-terminated. With nothing accepted the
  // file is left empty instead of holding a single blank line.
  const lines = out.map((r) => JSON.stringify(r));
  const payload = lines.length > 0 ? `${lines.join('\n')}\n` : '';
  await fs.mkdir(path.dirname(GOLD_PATH), { recursive: true });
  await fs.writeFile(GOLD_PATH, payload, 'utf8');

  // Every accepted generation is written, so both counts are out.length.
  console.log(`Regenerated gold at ${GOLD_PATH}`);
  console.log(`Accepted records: ${out.length}`);
  console.log(`Total written: ${out.length}`);
}
| |
|
/**
 * Report whether this file is the script Node was launched with.
 *
 * The check uses process.argv[1] rather than a URL rebuilt from __filename:
 * __filename is itself derived from import.meta.url, so that comparison also
 * succeeds when the module is merely imported. The launch path is resolved
 * through symlinks so an invocation via a symlink still matches; if it
 * cannot be resolved, the plain absolute path is compared instead.
 *
 * @returns {Promise<boolean>}
 */
async function isDirectRun() {
  const entry = process.argv[1];
  if (!entry) return false;
  try {
    return (await fs.realpath(entry)) === __filename;
  } catch {
    return path.resolve(entry) === __filename;
  }
}

// Run only as a CLI; importing the module must not trigger a regeneration.
if (await isDirectRun()) {
  main().catch((err) => {
    console.error('Regenerate gold error:', err);
    process.exit(1);
  });
}
| |
|
| | export { main }; |
| |
|