#!/usr/bin/env node
// scripts/cache_report.mjs
// Summarize cache status (questions/generations/verifications/rewards).
//
// Environment variables read by this script:
//   CACHE_REPORT_MODE   'thinking' | 'default' -> thinking cache only
//                       'instruct'             -> instruct cache only
//                       anything else / unset  -> both caches
//   PIPELINE_CACHE_DIR  when set, report on this single directory instead;
//                       a relative path is joined onto the project root.

import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

const PROJECT_ROOT = path.join(__dirname, '..');
const DEFAULT_CACHE_DIR = path.join(PROJECT_ROOT, 'data', 'cache');
const INSTRUCT_CACHE_DIR = path.join(PROJECT_ROOT, 'data', 'cache_instruct');

// Normalized report mode: 'thinking', 'instruct' or 'both'.
// Unset, 'both', 'all' and any unrecognized value all resolve to 'both'.
const MODE = (() => {
  const v = process.env.CACHE_REPORT_MODE;
  if (!v) return 'both';
  const s = String(v).toLowerCase();
  if (['thinking', 'default'].includes(s)) return 'thinking';
  if (s === 'instruct') return 'instruct';
  return 'both';
})();

// Explicit cache directory override (absolute, or relative to PROJECT_ROOT); null when unset.
const customDir = process.env.PIPELINE_CACHE_DIR
  ? (path.isAbsolute(process.env.PIPELINE_CACHE_DIR)
    ? process.env.PIPELINE_CACHE_DIR
    : path.join(PROJECT_ROOT, process.env.PIPELINE_CACHE_DIR))
  : null;

// Directories to report on, in print order. A custom directory takes precedence over MODE.
const CACHE_DIRS = (() => {
  if (customDir) {
    return [{ label: 'custom', dir: customDir }];
  }
  if (MODE === 'thinking') {
    return [{ label: 'thinking (default)', dir: DEFAULT_CACHE_DIR }];
  }
  if (MODE === 'instruct') {
    return [{ label: 'instruct', dir: INSTRUCT_CACHE_DIR }];
  }
  return [
    { label: 'thinking (default)', dir: DEFAULT_CACHE_DIR },
    { label: 'instruct', dir: INSTRUCT_CACHE_DIR },
  ];
})();

// File names looked up inside each cache directory.
const FILES = {
  questions: 'questions.jsonl',
  generations: 'generations.jsonl',
  verifications: 'verifications.jsonl',
  rewards: 'rewards.jsonl',
};

/**
 * Read a JSON Lines file and return its parsed records.
 *
 * Blank lines are ignored. Lines that fail to parse are skipped, and a single
 * warning with their count is written to stderr so an under-counting report
 * is not silent. Parsed values that are falsy (e.g. `null`, `0`) are dropped.
 *
 * @param {string} cacheDir - Directory containing the file.
 * @param {string} fileName - File name inside `cacheDir`.
 * @returns {Promise<Array<*>>} Parsed records; an empty array if the file does not exist.
 * @throws Any read error other than ENOENT is rethrown unchanged.
 */
async function readJsonl(cacheDir, fileName) {
  const filePath = path.join(cacheDir, fileName);

  let txt;
  try {
    txt = await fs.readFile(filePath, 'utf8');
  } catch (e) {
    // A missing file just means nothing has been cached yet.
    if (e.code === 'ENOENT') return [];
    throw e;
  }

  const records = [];
  let malformed = 0;
  for (const rawLine of txt.split('\n')) {
    const line = rawLine.trim();
    if (!line) continue;
    try {
      const parsed = JSON.parse(line);
      if (parsed) records.push(parsed);
    } catch {
      malformed += 1;
    }
  }

  if (malformed > 0) {
    // stderr only, so the report on stdout keeps its exact format.
    console.warn(`Warning: skipped ${malformed} malformed line(s) in ${filePath}`);
  }
  return records;
}

/**
 * Return the distinct values of `arr`, keeping first-seen order.
 *
 * @param {Array<*>} arr
 * @returns {Array<*>}
 */
function uniq(arr) {
  return [...new Set(arr)];
}

/**
 * Count questions across question records. A record contributes the length of
 * its `questions` array if present, otherwise the length of its
 * `question_ids` array, otherwise 1.
 *
 * @param {Array<*>} questionRecords
 * @returns {number}
 */
function countQuestions(questionRecords) {
  return questionRecords.reduce((acc, r) => {
    if (Array.isArray(r.questions)) return acc + r.questions.length;
    if (Array.isArray(r.question_ids)) return acc + r.question_ids.length;
    return acc + 1;
  }, 0);
}

/**
 * Build the [label, value] rows of the report for one cache directory.
 *
 * @param {string} dir - Cache directory the records came from.
 * @param {{questions: Array<*>, generations: Array<*>, verifications: Array<*>, rewards: Array<*>}} records
 * @returns {Array<[string, (string|number)]>}
 */
function buildRows(dir, { questions, generations, verifications, rewards }) {
  // Distinct truthy chunk_id values seen in any of the four files.
  const chunkIds = uniq(
    [questions, generations, verifications, rewards]
      .flatMap((list) => list.map((r) => r.chunk_id))
      .filter(Boolean),
  );

  // Only a strict `ok === true` counts as passed.
  const passedVerifications = verifications.filter((v) => v.ok === true).length;
  const passedRewards = rewards.filter((r) => r.ok === true).length;

  return [
    ['Cache dir', dir],
    ['Unique chunks', chunkIds.length],
    ['Question records', questions.length],
    ['Questions total', countQuestions(questions)],
    ['Generation records', generations.length],
    ['Verification records', verifications.length],
    ['Verifications ok', passedVerifications],
    ['Reward records', rewards.length],
    ['Rewards ok', passedRewards],
  ];
}

/**
 * Print rows as `key:   value`, aligning values in one column.
 *
 * @param {Array<[string, (string|number)]>} rows
 */
function printRows(rows) {
  const colWidth = Math.max(...rows.map(([k]) => k.length)) + 2;
  for (const [key, val] of rows) {
    // `key:` padded to colWidth + 1 leaves at least two spaces before the value.
    console.log(`${`${key}:`.padEnd(colWidth + 1)}${val}`);
  }
}

/**
 * Print the effective mode, then one summary table per configured cache directory.
 *
 * @returns {Promise<void>}
 */
async function main() {
  if (customDir) {
    console.log(`CACHE_REPORT_MODE=custom (PIPELINE_CACHE_DIR=${customDir})`);
  } else {
    console.log(`CACHE_REPORT_MODE=${MODE}`);
  }

  for (const { label, dir } of CACHE_DIRS) {
    // The four files are independent, so read them concurrently.
    const [questions, generations, verifications, rewards] = await Promise.all([
      readJsonl(dir, FILES.questions),
      readJsonl(dir, FILES.generations),
      readJsonl(dir, FILES.verifications),
      readJsonl(dir, FILES.rewards),
    ]);

    console.log(`\n== ${label} cache ==`);
    printRows(buildRows(dir, { questions, generations, verifications, rewards }));
  }
}

main().catch((err) => {
  console.error('Cache report error:', err);
  // Set the exit code instead of calling process.exit() so pending
  // stdout/stderr writes are flushed before the process ends.
  process.exitCode = 1;
});