distill-pipeline / scripts /cache_report.mjs
htaf's picture
added new instruct pipeline for faster generation
2739b3a
#!/usr/bin/env node
// scripts/cache_report.mjs
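// Summarize cache status (questions/generations/verifications/rewards).
// Environment: CACHE_REPORT_MODE selects thinking | instruct | both (default: both);
// PIPELINE_CACHE_DIR, if set, reports on that single directory instead.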
import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';
// ES modules have no built-in __filename/__dirname, so derive them from the module URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// The project root is the parent of the directory that contains this script.
const PROJECT_ROOT = path.resolve(__dirname, '..');

// Cache locations reported on when no custom directory is supplied.
const DEFAULT_CACHE_DIR = path.resolve(PROJECT_ROOT, 'data', 'cache');
const INSTRUCT_CACHE_DIR = path.resolve(PROJECT_ROOT, 'data', 'cache_instruct');
// Report mode taken from the CACHE_REPORT_MODE environment variable (case-insensitive).
// Resolves to 'thinking', 'instruct' or 'both'; an unset, empty or unrecognised
// value resolves to 'both'.
const MODE = (() => {
  const raw = process.env.CACHE_REPORT_MODE;
  if (!raw) {
    return 'both';
  }

  switch (String(raw).toLowerCase()) {
    case 'thinking':
    case 'default':
      return 'thinking';
    case 'instruct':
      return 'instruct';
    case 'both':
    case 'all':
    default:
      return 'both';
  }
})();
// Optional override from PIPELINE_CACHE_DIR: an absolute path is used verbatim,
// a relative one is joined onto PROJECT_ROOT. null when the variable is unset or empty.
const customDir = (() => {
  const override = process.env.PIPELINE_CACHE_DIR;
  if (!override) {
    return null;
  }
  if (path.isAbsolute(override)) {
    return override;
  }
  return path.join(PROJECT_ROOT, override);
})();
// List of { label, dir } entries to report on. A custom directory takes
// precedence over MODE; otherwise MODE picks one or both built-in caches.
const CACHE_DIRS = (() => {
  if (customDir) {
    return [{ label: 'custom', dir: customDir }];
  }

  const thinkingEntry = { label: 'thinking (default)', dir: DEFAULT_CACHE_DIR };
  const instructEntry = { label: 'instruct', dir: INSTRUCT_CACHE_DIR };

  switch (MODE) {
    case 'thinking':
      return [thinkingEntry];
    case 'instruct':
      return [instructEntry];
    default:
      return [thinkingEntry, instructEntry];
  }
})();
// Maps each record kind to the JSONL file name looked up inside a cache directory.
const FILES = Object.fromEntries([
  ['questions', 'questions.jsonl'],
  ['generations', 'generations.jsonl'],
  ['verifications', 'verifications.jsonl'],
  ['rewards', 'rewards.jsonl'],
]);
/**
 * Read a JSON Lines file and return its parsed records.
 *
 * Blank lines, lines that are not valid JSON, and lines whose parsed value is
 * falsy (null, 0, false, "") are skipped. A missing file yields an empty array.
 *
 * @param {string} cacheDir - Directory containing the file.
 * @param {string} fileName - File name inside cacheDir.
 * @returns {Promise<Array>} Parsed values in file order.
 * @throws Any read error other than ENOENT is rethrown.
 */
async function readJsonl(cacheDir, fileName) {
  const target = path.join(cacheDir, fileName);

  let contents;
  try {
    contents = await fs.readFile(target, 'utf8');
  } catch (err) {
    // A cache file that was never written simply means "no records".
    if (err.code === 'ENOENT') return [];
    throw err;
  }

  const records = [];
  for (const rawLine of contents.split('\n')) {
    const line = rawLine.trim();
    if (line === '') continue;

    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch {
      // Best-effort: ignore lines that are not valid JSON.
      continue;
    }

    if (parsed) records.push(parsed);
  }
  return records;
}
/**
 * Return a new array with duplicates removed, keeping first-occurrence order.
 *
 * @param {Iterable} arr - Values to de-duplicate.
 * @returns {Array} Distinct values.
 */
function uniq(arr) {
  const distinct = new Set(arr);
  return Array.from(distinct);
}
/**
 * Print a summary table for every entry in CACHE_DIRS.
 *
 * First logs the effective mode (or the custom directory), then for each cache
 * reads the four JSONL files and prints record counts, the number of distinct
 * chunk ids, and how many verifications/rewards have `ok === true`.
 *
 * @returns {Promise<void>}
 * @throws Propagates any read error surfaced by readJsonl.
 */
async function main() {
  if (customDir) {
    console.log(`CACHE_REPORT_MODE=custom (PIPELINE_CACHE_DIR=${customDir})`);
  } else {
    console.log(`CACHE_REPORT_MODE=${MODE}`);
  }

  for (const { label, dir } of CACHE_DIRS) {
    // The four files are independent of each other, so read them concurrently
    // instead of waiting for each one in turn.
    const [questions, generations, verifications, rewards] = await Promise.all([
      readJsonl(dir, FILES.questions),
      readJsonl(dir, FILES.generations),
      readJsonl(dir, FILES.verifications),
      readJsonl(dir, FILES.rewards),
    ]);

    // Distinct chunk ids seen in any of the four files.
    // NOTE: falsy ids (undefined, null, 0, '') are excluded by the Boolean filter.
    const chunkIds = uniq(
      [questions, generations, verifications, rewards]
        .flatMap((records) => records.map((r) => r.chunk_id))
        .filter(Boolean),
    );

    // A question record contributes the length of its `questions` array, else of
    // its `question_ids` array, else it counts as a single question.
    const totalQuestions = questions.reduce((acc, r) => {
      if (Array.isArray(r.questions)) return acc + r.questions.length;
      if (Array.isArray(r.question_ids)) return acc + r.question_ids.length;
      return acc + 1;
    }, 0);

    const passedVerifications = verifications.filter((v) => v.ok === true).length;
    const passedRewards = rewards.filter((r) => r.ok === true).length;

    console.log(`\n== ${label} cache ==`);
    const rows = [
      ['Cache dir', dir],
      ['Unique chunks', chunkIds.length],
      ['Question records', questions.length],
      ['Questions total', totalQuestions],
      ['Generation records', generations.length],
      ['Verification records', verifications.length],
      ['Verifications ok', passedVerifications],
      ['Reward records', rewards.length],
      ['Rewards ok', passedRewards],
    ];

    // Align values in one column: the longest key plus two spaces of gutter.
    const colWidth = Math.max(...rows.map(([k]) => k.length)) + 2;
    for (const [key, val] of rows) {
      // `key:` is one character longer than the key, hence colWidth + 1.
      console.log(`${`${key}:`.padEnd(colWidth + 1)}${val}`);
    }
  }
}
// Script entry point: run the report; on any failure log it and exit with status 1.
function reportFatal(failure) {
  console.error('Cache report error:', failure);
  process.exit(1);
}

main().catch(reportFatal);