|
|
#!/usr/bin/env node |
|
|
import fs from 'fs/promises'; |
|
|
import path from 'path'; |
|
|
import { fileURLToPath } from 'url'; |
|
|
|
|
|
import { loadProviderFor } from '../providers/provider.mjs'; |
|
|
import { runQuestionGenerator } from './question_core.mjs'; |
|
|
|
|
|
// ES modules do not get the CommonJS `__filename` / `__dirname` globals, so
// both are rebuilt here from this module's own URL.
const __filename = fileURLToPath(new URL(import.meta.url));
const __dirname = path.dirname(__filename);
|
|
|
|
|
/**
 * Parse the command-line arguments of the question-generation CLI.
 *
 * Recognised flags:
 *   --input,  -i <path>  Input JSONL file (required).
 *   --output, -o <path>  Output JSONL file. When omitted, defaults to
 *                        `test_samples/seed_questions.generated.jsonl`
 *                        resolved two directories above this script.
 *   --max,    -m <n>     Maximum questions per chunk; must be a positive
 *                        integer. Defaults to 5.
 *
 * Unrecognised arguments are ignored. If no input path is supplied, a usage
 * message is printed to stderr and the process exits with status 1.
 *
 * @param {string[]} argv - Arguments following the script path.
 * @returns {{ input: string, output: string, max: number }} Parsed options.
 */
function parseArgs(argv) {
  const DEFAULT_MAX = 5;
  const opts = {
    input: null,
    output: null,
    max: DEFAULT_MAX,
  };

  for (let i = 0; i < argv.length; i++) {
    const a = argv[i];
    if (a === '--input' || a === '-i') {
      opts.input = argv[++i];
    } else if (a === '--output' || a === '-o') {
      opts.output = argv[++i];
    } else if (a === '--max' || a === '-m') {
      const raw = argv[++i];
      // A missing or empty value silently keeps the default; anything that is
      // not a positive integer (e.g. "abc" -> NaN, 0, -2, 2.5) is rejected
      // with a warning instead of being forwarded to the generator.
      const parsed = Number(raw || DEFAULT_MAX);
      if (Number.isInteger(parsed) && parsed > 0) {
        opts.max = parsed;
      } else {
        console.error(`Ignoring invalid --max value "${raw}"; using ${DEFAULT_MAX}.`);
        opts.max = DEFAULT_MAX;
      }
    }
  }

  if (!opts.input) {
    console.error(
      'Usage: node src/question/question_cli.mjs --input chunks.jsonl --output seed_questions.jsonl [--max 5]',
    );
    process.exit(1);
  }

  if (!opts.output) {
    opts.output = path.join(__dirname, '..', '..', 'test_samples', 'seed_questions.generated.jsonl');
  }

  return opts;
}
|
|
|
|
|
/**
 * Read a JSONL file and yield one record object per non-blank line.
 *
 * The whole file is read into memory, split on newlines, and each line is
 * trimmed (which also drops a trailing carriage return). Lines that parse to
 * a JSON object are yielded as-is. Any other line — unparseable text, or
 * valid JSON that is not an object (`null`, numbers, strings, arrays) — is
 * yielded as `{ content: <trimmed line> }`, so consumers can always read
 * properties off the yielded value.
 *
 * @param {string} filePath - Path of the file to read.
 * @yields {object} One record per non-blank line.
 */
async function* readJsonl(filePath) {
  const txt = await fs.readFile(filePath, 'utf8');

  for (const line of txt.split('\n')) {
    const trimmed = line.trim();
    if (!trimmed) continue;

    let parsed;
    try {
      parsed = JSON.parse(trimmed);
    } catch {
      // Deliberate best-effort: a non-JSON line is treated as raw text below.
      parsed = undefined;
    }

    const isRecord =
      parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
    yield isRecord ? parsed : { content: trimmed };
  }
}
|
|
|
|
|
/**
 * CLI entry point: generate seed questions for every chunk in the input
 * JSONL file and write them to the output JSONL file.
 *
 * For each input record the context text is taken from `content`, falling
 * back to `text`. Records whose text is missing, blank, or not a string are
 * skipped. Each generated question becomes one output line of the form
 * `{ question, source_id, meta: { doc_id, chunk_index } }`.
 *
 * The output is written once, after all records have been handled, so an
 * error raised while generating questions leaves no output file behind.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = parseArgs(process.argv.slice(2));

  const provider = loadProviderFor('question');

  console.log('🧩 Question generation');
  console.log(` Input chunks: ${args.input}`);
  console.log(` Output seeds: ${args.output}`);
  console.log(` Max questions/chunk: ${args.max}`);
  console.log('');

  // Make sure the destination directory exists before doing any work.
  const outDir = path.dirname(args.output);
  await fs.mkdir(outDir, { recursive: true });

  // Counts every record read, including skipped ones; it also serves as the
  // fallback id (1-based record position) when a record has no id.
  let chunkCount = 0;
  let questionCount = 0;
  const lines = [];

  for await (const rec of readJsonl(args.input)) {
    chunkCount += 1;
    const id = rec.id ?? rec.doc_id ?? chunkCount;
    const contextText = rec.content ?? rec.text ?? '';

    // A non-string value (e.g. a number) has no `.trim()`; skip it like an
    // empty chunk rather than aborting the whole run with a TypeError.
    if (typeof contextText !== 'string' || !contextText.trim()) continue;

    console.log(`→ [${chunkCount}] generating questions for chunk id=${id}…`);

    const { questions } = await runQuestionGenerator(contextText, provider, {
      maxQuestions: args.max,
    });

    for (const q of questions) {
      questionCount += 1;
      const seed = {
        question: q,
        source_id: id,
        meta: {
          doc_id: rec.doc_id ?? null,
          chunk_index: rec.chunk_index ?? null,
        },
      };
      lines.push(`${JSON.stringify(seed)}\n`);
    }
  }

  await fs.writeFile(args.output, lines.join(''), 'utf8');

  console.log('');
  console.log('✅ Done');
  console.log(` Chunks processed: ${chunkCount}`);
  console.log(` Questions generated: ${questionCount}`);
  console.log(` Seeds JSONL: ${args.output}`);
}
|
|
|
|
|
// Run the CLI; any rejection from main() is reported on stderr and turned
// into a non-zero exit status.
main().catch((failure) => {
  console.error('❌ Question generation error:', failure);
  process.exit(1);
});
|
|
|