File size: 3,086 Bytes
ebd14c3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 |
#!/usr/bin/env node
import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';
import { loadProviderFor } from '../providers/provider.mjs';
import { runQuestionGenerator } from './question_core.mjs';
// ES modules do not provide __filename/__dirname, so both are derived from
// this module's URL. __dirname anchors the default output path in parseArgs.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
/**
 * Parse the command-line arguments of the question-generation CLI.
 *
 * Recognised flags:
 *   --input,  -i   path of the chunks JSONL file to read (required)
 *   --output, -o   path of the seeds JSONL file to write
 *   --max,    -m   maximum number of questions per chunk (default 5)
 *
 * Unknown arguments are ignored. When no input path is given, a usage
 * message is printed to stderr and the process exits with status 1.
 * When no output path is given, it defaults to
 * `<this dir>/../../test_samples/seed_questions.generated.jsonl`.
 *
 * @param {string[]} argv - Arguments without the node binary and script path.
 * @returns {{ input: string, output: string, max: number }} Parsed options.
 */
function parseArgs(argv) {
  const DEFAULT_MAX = 5;
  const opts = {
    input: null,
    output: null,
    max: DEFAULT_MAX,
  };

  for (let i = 0; i < argv.length; i++) {
    const a = argv[i];
    if (a === '--input' || a === '-i') {
      opts.input = argv[++i];
    } else if (a === '--output' || a === '-o') {
      opts.output = argv[++i];
    } else if (a === '--max' || a === '-m') {
      const raw = argv[++i];
      if (raw === undefined || raw === '') {
        // Flag given without a value: keep the documented default.
        opts.max = DEFAULT_MAX;
        continue;
      }
      const parsed = Number(raw);
      if (Number.isInteger(parsed) && parsed > 0) {
        opts.max = parsed;
      } else {
        // Never forward NaN, negative or fractional limits to the generator.
        console.error(
          `Warning: ignoring invalid --max value "${raw}"; using ${DEFAULT_MAX}.`,
        );
        opts.max = DEFAULT_MAX;
      }
    }
  }

  if (!opts.input) {
    console.error(
      'Usage: node src/question/question_cli.mjs --input chunks.jsonl --output seed_questions.jsonl [--max 5]',
    );
    process.exit(1);
  }

  if (!opts.output) {
    opts.output = path.join(__dirname, '..', '..', 'test_samples', 'seed_questions.generated.jsonl');
  }

  return opts;
}
/**
 * Read a JSONL file and yield one record object per non-blank line.
 *
 * The whole file is read into memory, split on newlines, and each line is
 * trimmed (which also drops a trailing carriage return). Every yielded value
 * is a plain object so consumers can read properties from it safely:
 *   - a line holding a JSON object is yielded as parsed;
 *   - a line holding a JSON string is yielded as `{ content: <that string> }`;
 *   - any other line (invalid JSON, or a JSON number/boolean/null/array) is
 *     yielded as `{ content: <raw trimmed line> }`.
 *
 * @param {string} filePath - Path of the JSONL file to read.
 * @yields {object} One record per non-blank line.
 */
async function* readJsonl(filePath) {
  const txt = await fs.readFile(filePath, 'utf8');

  for (const line of txt.split('\n')) {
    const trimmed = line.trim();
    if (!trimmed) continue;

    let parsed;
    try {
      parsed = JSON.parse(trimmed);
    } catch {
      // Not JSON at all: treat the raw line as the chunk content.
      yield { content: trimmed };
      continue;
    }

    const isPlainRecord =
      parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);

    if (isPlainRecord) {
      yield parsed;
    } else if (typeof parsed === 'string') {
      // A quoted JSON string is text content in its own right.
      yield { content: parsed };
    } else {
      // Plain text that happens to be valid JSON ("null", "42", "[1,2]").
      yield { content: trimmed };
    }
  }
}
/**
 * CLI entry point: generate seed questions for every chunk of the input
 * JSONL file and write them to the output JSONL file.
 *
 * For each input record the chunk text is taken from `content`, falling back
 * to `text`; records with no usable text are skipped. Each question returned
 * by `runQuestionGenerator` becomes one output line of the form
 * `{ question, source_id, meta: { doc_id, chunk_index } }`.
 *
 * The output file is written once, after all chunks have been processed, so
 * malformed records are skipped rather than allowed to throw and discard the
 * questions generated so far.
 *
 * @returns {Promise<void>} Resolves once the output file has been written.
 */
async function main() {
  const args = parseArgs(process.argv.slice(2));
  const provider = loadProviderFor('question');

  console.log('🧩 Question generation');
  console.log(` Input chunks: ${args.input}`);
  console.log(` Output seeds: ${args.output}`);
  console.log(` Max questions/chunk: ${args.max}`);
  console.log('');

  // Make sure the destination directory exists before doing any slow work.
  const outDir = path.dirname(args.output);
  await fs.mkdir(outDir, { recursive: true });

  let chunkCount = 0;
  let questionCount = 0;
  const seedLines = [];

  for await (const rec of readJsonl(args.input)) {
    chunkCount += 1;

    // A JSONL line may hold a JSON primitive or null; treat it as an empty record.
    const record = rec !== null && typeof rec === 'object' ? rec : {};
    const id = record.id ?? record.doc_id ?? chunkCount;
    const contextText = record.content ?? record.text ?? '';

    if (typeof contextText !== 'string') {
      // Calling .trim() on a non-string would throw and abort the whole run.
      console.warn(`→ [${chunkCount}] skipping chunk id=${id}: content is not a string`);
      continue;
    }
    if (!contextText.trim()) continue;

    console.log(`→ [${chunkCount}] generating questions for chunk id=${id}…`);
    const result = await runQuestionGenerator(contextText, provider, {
      maxQuestions: args.max,
    });

    // Tolerate a result without a `questions` list instead of throwing.
    const questions = result?.questions ?? [];

    for (const q of questions) {
      questionCount += 1;
      const seed = {
        question: q,
        source_id: id,
        // keep some trace of where it came from
        meta: {
          doc_id: record.doc_id ?? null,
          chunk_index: record.chunk_index ?? null,
        },
      };
      seedLines.push(`${JSON.stringify(seed)}\n`);
    }
  }

  await fs.writeFile(args.output, seedLines.join(''), 'utf8');

  console.log('');
  console.log('✅ Done');
  console.log(` Chunks processed: ${chunkCount}`);
  console.log(` Questions generated: ${questionCount}`);
  console.log(` Seeds JSONL: ${args.output}`);
}
/**
 * Report a failure of the CLI run and terminate with a non-zero exit status.
 *
 * @param {unknown} failure - The rejection reason from main().
 */
function reportFatalError(failure) {
  console.error('❌ Question generation error:', failure);
  process.exit(1);
}

// Run the CLI; any rejection from main() is logged and turned into exit code 1.
main().catch(reportFatalError);
|