File size: 4,162 Bytes
fad3187
 
 
 
 
 
 
 
 
 
 
 
2739b3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fad3187
2739b3a
 
 
 
 
 
 
 
 
 
fad3187
 
 
 
 
 
 
 
 
2739b3a
 
fad3187
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2739b3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fad3187
2739b3a
 
 
 
 
fad3187
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/usr/bin/env node
// scripts/cache_report.mjs
// Summarize cache status (questions/generations/verifications/rewards).

import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';

// ES modules have no built-in __filename/__dirname, so derive them from the module URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// The project root is the parent of the directory containing this script.
const PROJECT_ROOT = path.resolve(__dirname, '..');

// Cache locations used when PIPELINE_CACHE_DIR is not set.
const DEFAULT_CACHE_DIR = path.resolve(PROJECT_ROOT, 'data', 'cache');
const INSTRUCT_CACHE_DIR = path.resolve(PROJECT_ROOT, 'data', 'cache_instruct');

// Report mode, taken from CACHE_REPORT_MODE (case-insensitive):
//   'thinking' | 'default' -> 'thinking'
//   'instruct'             -> 'instruct'
//   unset, 'both', 'all', or any unrecognized value -> 'both'
const MODE = (() => {
  const requested = process.env.CACHE_REPORT_MODE;
  if (!requested) return 'both';

  switch (String(requested).toLowerCase()) {
    case 'thinking':
    case 'default':
      return 'thinking';
    case 'instruct':
      return 'instruct';
    default:
      return 'both';
  }
})();

// Optional cache directory override from PIPELINE_CACHE_DIR.
// Absolute paths are used verbatim; relative paths are taken relative to the
// project root. null when the variable is unset or empty.
const customDir = (() => {
  const override = process.env.PIPELINE_CACHE_DIR;
  if (!override) return null;
  if (path.isAbsolute(override)) return override;
  return path.join(PROJECT_ROOT, override);
})();

// List of { label, dir } entries to report on.
// A custom directory override takes precedence over MODE; otherwise MODE
// selects the thinking cache, the instruct cache, or both (in that order).
const CACHE_DIRS = (() => {
  if (customDir) return [{ label: 'custom', dir: customDir }];

  const thinkingEntry = { label: 'thinking (default)', dir: DEFAULT_CACHE_DIR };
  const instructEntry = { label: 'instruct', dir: INSTRUCT_CACHE_DIR };

  switch (MODE) {
    case 'thinking':
      return [thinkingEntry];
    case 'instruct':
      return [instructEntry];
    default:
      return [thinkingEntry, instructEntry];
  }
})();

// File name of each JSONL cache file, keyed by the kind of record it holds.
// Frozen so that the shared lookup table cannot be modified by accident.
const FILES = Object.freeze({
  questions: 'questions.jsonl',
  generations: 'generations.jsonl',
  verifications: 'verifications.jsonl',
  rewards: 'rewards.jsonl',
});

/**
 * Read a JSONL file and return its parsed records.
 *
 * Lines are trimmed; blank lines, lines that are not valid JSON, and lines
 * whose parsed value is falsy (e.g. `null`, `0`) are skipped.
 *
 * @param {string} cacheDir - Directory containing the file.
 * @param {string} fileName - Name of the JSONL file inside `cacheDir`.
 * @returns {Promise<Array>} Parsed records in file order; `[]` if the file does not exist.
 * @throws Any read error other than ENOENT is rethrown.
 */
async function readJsonl(cacheDir, fileName) {
  const target = path.join(cacheDir, fileName);

  let contents;
  try {
    contents = await fs.readFile(target, 'utf8');
  } catch (err) {
    // A missing cache file simply means "no records yet".
    if (err.code === 'ENOENT') return [];
    throw err;
  }

  const records = [];
  for (const rawLine of contents.split('\n')) {
    const line = rawLine.trim();
    if (!line) continue;

    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch {
      // Malformed line: skip it rather than failing the whole report.
      continue;
    }

    if (parsed) records.push(parsed);
  }
  return records;
}

/**
 * Return a new array holding the distinct values of `arr`, in first-seen order.
 *
 * @param {Iterable} arr - Values to de-duplicate.
 * @returns {Array} A fresh array without duplicates.
 */
function uniq(arr) {
  const distinct = new Set(arr);
  return Array.from(distinct);
}

/**
 * Print a summary table for every directory in CACHE_DIRS.
 *
 * For each cache directory the four JSONL files named in FILES are read and
 * the following are reported: number of distinct truthy `chunk_id` values
 * across all files, record counts per file, the total number of questions,
 * and how many verification/reward records have `ok === true`.
 *
 * @returns {Promise<void>} Resolves once every report has been printed.
 * @throws Propagates any read error raised by readJsonl.
 */
async function main() {
  if (customDir) {
    console.log(`CACHE_REPORT_MODE=custom (PIPELINE_CACHE_DIR=${customDir})`);
  } else {
    console.log(`CACHE_REPORT_MODE=${MODE}`);
  }

  for (const { label, dir } of CACHE_DIRS) {
    // The four files are independent of each other, so read them concurrently.
    const [questions, generations, verifications, rewards] = await Promise.all([
      readJsonl(dir, FILES.questions),
      readJsonl(dir, FILES.generations),
      readJsonl(dir, FILES.verifications),
      readJsonl(dir, FILES.rewards),
    ]);

    // Distinct chunk ids seen in any file; records without a chunk_id are ignored.
    const chunkIds = uniq([
      ...questions.map((r) => r.chunk_id),
      ...generations.map((r) => r.chunk_id),
      ...verifications.map((r) => r.chunk_id),
      ...rewards.map((r) => r.chunk_id),
    ].filter(Boolean));

    // A question record may carry a `questions` array or a `question_ids`
    // array; a record with neither counts as a single question.
    const totalQuestions = questions.reduce((acc, r) => {
      if (Array.isArray(r.questions)) return acc + r.questions.length;
      if (Array.isArray(r.question_ids)) return acc + r.question_ids.length;
      return acc + 1;
    }, 0);

    const totalGenerations = generations.length;
    const totalVerifications = verifications.length;
    const totalRewards = rewards.length;

    // Only a strict boolean `true` counts as passed.
    const passedVerifications = verifications.filter((v) => v.ok === true).length;
    const passedRewards = rewards.filter((r) => r.ok === true).length;

    console.log(`\n== ${label} cache ==`);
    const rows = [
      ['Cache dir', dir],
      ['Unique chunks', chunkIds.length],
      ['Question records', questions.length],
      ['Questions total', totalQuestions],
      ['Generation records', totalGenerations],
      ['Verification records', totalVerifications],
      ['Verifications ok', passedVerifications],
      ['Reward records', totalRewards],
      ['Rewards ok', passedRewards],
    ];

    // Pad after the colon so that all values start in the same column.
    const colWidth = Math.max(...rows.map(([k]) => k.length)) + 2;
    for (const [key, val] of rows) {
      const pad = ' '.repeat(colWidth - key.length);
      console.log(`${key}:${pad}${val}`);
    }
  }
}

// Script entry point: run the report; on failure, log the error and exit with status 1.
function exitWithError(failure) {
  console.error('Cache report error:', failure);
  process.exit(1);
}

main().catch(exitWithError);