distill-pipeline / src /question /question_cli.mjs
htaf's picture
added question generator
ebd14c3
#!/usr/bin/env node
import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';
import { loadProviderFor } from '../providers/provider.mjs';
import { runQuestionGenerator } from './question_core.mjs';
// Recreate CommonJS-style __filename/__dirname from import.meta.url;
// __dirname is used to build the default output path in parseArgs.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
/**
 * Parse the command-line arguments of the question generator.
 *
 * Recognized flags:
 *   --input,  -i <path>  JSONL file of chunks (required).
 *   --output, -o <path>  Destination JSONL file. Defaults to
 *                        test_samples/seed_questions.generated.jsonl, resolved
 *                        two directories above this module.
 *   --max,    -m <n>     Maximum questions per chunk; must be a positive
 *                        integer. Defaults to 5.
 *
 * Unrecognized arguments are ignored. When --input is missing, the usage line
 * is printed to stderr and the process exits with status 1.
 *
 * @param {string[]} argv - Arguments after the node binary and script path.
 * @returns {{ input: string, output: string, max: number }} Parsed options.
 */
function parseArgs(argv) {
  const DEFAULT_MAX = 5;
  const opts = {
    input: null,
    output: null,
    max: DEFAULT_MAX,
  };

  for (let i = 0; i < argv.length; i++) {
    const a = argv[i];
    if (a === '--input' || a === '-i') {
      opts.input = argv[++i];
    } else if (a === '--output' || a === '-o') {
      opts.output = argv[++i];
    } else if (a === '--max' || a === '-m') {
      const raw = argv[++i];
      const parsed = Number(raw);
      const hasValue = raw !== undefined && raw !== '';
      if (hasValue && Number.isInteger(parsed) && parsed > 0) {
        opts.max = parsed;
      } else {
        // A missing value silently falls back to the default (as before);
        // a present-but-invalid value (NaN, 0, negative, fractional) is
        // reported so it is never forwarded as the question limit.
        if (hasValue) {
          console.error(
            `Ignoring invalid --max value "${raw}"; using ${DEFAULT_MAX}.`,
          );
        }
        opts.max = DEFAULT_MAX;
      }
    }
  }

  if (!opts.input) {
    console.error(
      'Usage: node src/question/question_cli.mjs --input chunks.jsonl --output seed_questions.jsonl [--max 5]',
    );
    process.exit(1);
  }

  if (!opts.output) {
    opts.output = path.join(__dirname, '..', '..', 'test_samples', 'seed_questions.generated.jsonl');
  }

  return opts;
}
/**
 * Read a JSONL file and yield one record object per non-blank line.
 *
 * The whole file is read into memory, split on '\n', and each line is trimmed
 * (which also drops a trailing '\r'). Every yielded value is a plain object:
 *   - a line holding a JSON object is yielded as parsed;
 *   - a line holding a JSON string is yielded as `{ content: <decoded string> }`;
 *   - any other line (invalid JSON, or a JSON null/number/boolean/array) is
 *     yielded as `{ content: <trimmed line text> }`.
 *
 * @param {string} filePath - Path of the JSONL file to read.
 * @yields {object} One record per non-blank line.
 */
async function* readJsonl(filePath) {
  const txt = await fs.readFile(filePath, 'utf8');

  for (const line of txt.split('\n')) {
    const trimmed = line.trim();
    if (!trimmed) {
      continue;
    }

    let parsed;
    try {
      parsed = JSON.parse(trimmed);
    } catch {
      // Not JSON at all: treat the raw line as the chunk content.
      yield { content: trimmed };
      continue;
    }

    if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) {
      yield parsed;
    } else if (typeof parsed === 'string') {
      yield { content: parsed };
    } else {
      // null / number / boolean / array: consumers read properties off the
      // record, so wrap the line text instead of handing them a non-object.
      yield { content: trimmed };
    }
  }
}
/**
 * CLI entry point: generate seed questions for every chunk in the input JSONL
 * and write them to the output JSONL.
 *
 * For each record read from the input, the chunk text is taken from `content`
 * (falling back to `text`) and its id from `id`, then `doc_id`, then the
 * 1-based position of the record in the input. Records that are not objects,
 * or whose text is not a non-blank string, are skipped (they still count
 * toward "Chunks processed"). Each generated question becomes one output line
 * of the form `{ question, source_id, meta: { doc_id, chunk_index } }`.
 *
 * The output directory is created if needed and the output file is written
 * once, after all chunks have been handled.
 *
 * @returns {Promise<void>} Resolves after the output file has been written.
 */
async function main() {
  const args = parseArgs(process.argv.slice(2));
  const provider = loadProviderFor('question');

  console.log('🧩 Question generation');
  console.log(` Input chunks: ${args.input}`);
  console.log(` Output seeds: ${args.output}`);
  console.log(` Max questions/chunk: ${args.max}`);
  console.log('');

  const outDir = path.dirname(args.output);
  await fs.mkdir(outDir, { recursive: true });

  let chunkCount = 0;
  let questionCount = 0;
  const seedLines = [];

  for await (const rec of readJsonl(args.input)) {
    chunkCount += 1;

    // A JSONL line may legally hold null or a scalar; property access on
    // those would throw or be meaningless, so skip them.
    if (rec === null || typeof rec !== 'object') {
      continue;
    }

    const id = rec.id ?? rec.doc_id ?? chunkCount;
    const contextText = rec.content ?? rec.text ?? '';
    // Guard the type as well as emptiness: a numeric or object `content`
    // has no .trim() and would otherwise abort the whole run.
    if (typeof contextText !== 'string' || !contextText.trim()) {
      continue;
    }

    console.log(`→ [${chunkCount}] generating questions for chunk id=${id}…`);
    const result = await runQuestionGenerator(contextText, provider, {
      maxQuestions: args.max,
    });
    // Tolerate a generator result without questions instead of crashing on
    // iteration; any iterable of questions is accepted.
    const questions = result?.questions ?? [];

    for (const q of questions) {
      questionCount += 1;
      const seed = {
        question: q,
        source_id: id,
        // keep some trace of where it came from
        meta: {
          doc_id: rec.doc_id ?? null,
          chunk_index: rec.chunk_index ?? null,
        },
      };
      seedLines.push(`${JSON.stringify(seed)}\n`);
    }
  }

  await fs.writeFile(args.output, seedLines.join(''), 'utf8');

  console.log('');
  console.log('✅ Done');
  console.log(` Chunks processed: ${chunkCount}`);
  console.log(` Questions generated: ${questionCount}`);
  console.log(` Seeds JSONL: ${args.output}`);
}
// Run the CLI; on any rejection from main(), log the error to stderr and
// exit with status 1 so callers can detect the failure.
main().catch((err) => {
console.error('❌ Question generation error:', err);
process.exit(1);
});