added question generator
Browse files- package.json +2 -1
- prompts/question_prompt.txt +30 -0
- src/question/question_cli.mjs +117 -0
- src/question/question_core.mjs +70 -0
- tests/question_core.test.mjs +69 -0
package.json
CHANGED
|
@@ -4,7 +4,8 @@
|
|
| 4 |
"type": "module",
|
| 5 |
"scripts": {
|
| 6 |
"test": "vitest --run",
|
| 7 |
-
"pipeline": "node ./src/pipeline/pipeline_cli.js"
|
|
|
|
| 8 |
},
|
| 9 |
"devDependencies": {
|
| 10 |
"vitest": "^1.6.0"
|
|
|
|
| 4 |
"type": "module",
|
| 5 |
"scripts": {
|
| 6 |
"test": "vitest --run",
|
| 7 |
+
"pipeline": "node ./src/pipeline/pipeline_cli.js",
|
| 8 |
+
"qgen": "node ./src/question/question_cli.mjs"
|
| 9 |
},
|
| 10 |
"devDependencies": {
|
| 11 |
"vitest": "^1.6.0"
|
prompts/question_prompt.txt
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
You are a dataset-creation assistant.
|
| 2 |
+
|
| 3 |
+
You will be given a CONTEXT CHUNK of text from a larger corpus.
|
| 4 |
+
|
| 5 |
+
Your task:
|
| 6 |
+
|
| 7 |
+
1. Read the context carefully.
|
| 8 |
+
2. Write up to {{MAX_QUESTIONS}} diverse, high-quality questions
|
| 9 |
+
that can be answered ONLY from this context.
|
| 10 |
+
3. Prefer questions that:
|
| 11 |
+
- are conceptually interesting,
|
| 12 |
+
- require some reasoning or synthesis within the chunk,
|
| 13 |
+
- are answerable without outside knowledge.
|
| 14 |
+
|
| 15 |
+
Output STRICTLY in JSON with this shape:
|
| 16 |
+
|
| 17 |
+
{
|
| 18 |
+
"questions": [
|
| 19 |
+
"Question 1?",
|
| 20 |
+
"Question 2?",
|
| 21 |
+
"Question 3?"
|
| 22 |
+
]
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
Do NOT include answers in the JSON. Only questions.
|
| 26 |
+
|
| 27 |
+
---
|
| 28 |
+
CONTEXT START
|
| 29 |
+
{{CONTEXT}}
|
| 30 |
+
CONTEXT END
|
src/question/question_cli.mjs
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env node
|
| 2 |
+
import fs from 'fs/promises';
|
| 3 |
+
import path from 'path';
|
| 4 |
+
import { fileURLToPath } from 'url';
|
| 5 |
+
|
| 6 |
+
import { loadProviderFor } from '../providers/provider.mjs';
|
| 7 |
+
import { runQuestionGenerator } from './question_core.mjs';
|
| 8 |
+
|
| 9 |
+
const __filename = fileURLToPath(import.meta.url);
|
| 10 |
+
const __dirname = path.dirname(__filename);
|
| 11 |
+
|
| 12 |
+
/**
 * Parse the command-line arguments of the question-generation CLI.
 *
 * Recognised flags:
 *   --input,  -i <path>  JSONL file with the chunks to read (required)
 *   --output, -o <path>  JSONL file to write; defaults to
 *                        test_samples/seed_questions.generated.jsonl two
 *                        directories above this script
 *   --max,    -m <n>     maximum questions per chunk (default 5)
 *
 * Unrecognised arguments are ignored. When no input path is given, the usage
 * text is printed to stderr and the process exits with status 1.
 *
 * @param {string[]} argv - arguments after the script name
 * @returns {{ input: string, output: string, max: number }} parsed options
 */
function parseArgs(argv) {
  const DEFAULT_MAX = 5;
  const opts = {
    input: null,
    output: null,
    max: DEFAULT_MAX,
  };

  for (let i = 0; i < argv.length; i++) {
    const a = argv[i];
    if (a === '--input' || a === '-i') {
      opts.input = argv[++i];
    } else if (a === '--output' || a === '-o') {
      opts.output = argv[++i];
    } else if (a === '--max' || a === '-m') {
      const rawMax = argv[++i];
      const parsedMax = Number(rawMax);
      if (Number.isInteger(parsedMax) && parsedMax > 0) {
        opts.max = parsedMax;
      } else if (rawMax !== undefined) {
        // Number('abc') is NaN and would end up verbatim in the prompt;
        // zero, negative or fractional counts make no sense either.
        console.error(
          `Ignoring invalid --max value "${rawMax}"; using ${DEFAULT_MAX}.`,
        );
      }
    }
  }

  if (!opts.input) {
    console.error(
      'Usage: node src/question/question_cli.mjs --input chunks.jsonl --output seed_questions.jsonl [--max 5]',
    );
    process.exit(1);
  }

  if (!opts.output) {
    opts.output = path.join(__dirname, '..', '..', 'test_samples', 'seed_questions.generated.jsonl');
  }

  return opts;
}
|
| 43 |
+
|
| 44 |
+
/**
 * Read a JSONL file and yield one record object per non-blank line.
 *
 * The whole file is read into memory first and then split on newlines.
 * Each line is normalised as follows:
 *   - a JSON object (or array) is yielded as parsed;
 *   - a JSON string literal is yielded as `{ content: <that string> }`;
 *   - any other JSON value (null, number, boolean) and any line that is not
 *     valid JSON is yielded as `{ content: <the trimmed raw line> }`.
 *
 * Consumers can therefore always do property access on what they receive.
 *
 * @param {string} filePath - path of the JSONL file
 * @yields {object} one record per non-blank line
 */
async function* readJsonl(filePath) {
  const text = await fs.readFile(filePath, 'utf8');

  for (const line of text.split('\n')) {
    // trim() also strips the '\r' of CRLF line endings.
    const trimmed = line.trim();
    if (!trimmed) continue;

    let value;
    try {
      value = JSON.parse(trimmed);
    } catch {
      // Not JSON at all: treat the raw line as the chunk text.
      yield { content: trimmed };
      continue;
    }

    if (typeof value === 'string') {
      yield { content: value };
    } else if (value === null || typeof value !== 'object') {
      // `null`, numbers and booleans are not records; property access on
      // null would throw in the consumer, so fall back to the raw text.
      yield { content: trimmed };
    } else {
      yield value;
    }
  }
}
|
| 58 |
+
|
| 59 |
+
/**
 * CLI entry point: read chunks from the input JSONL, ask the provider for
 * questions about each chunk, and write one seed record per question to the
 * output JSONL.
 *
 * Each seed has the shape
 *   { question, source_id, meta: { doc_id, chunk_index } }
 * where `source_id` is the record's `id`, else its `doc_id`, else the
 * 1-based position of the record in the input.
 *
 * Records without usable text (`content` / `text` missing, blank, or not a
 * string) are skipped. The output file is written once, after all chunks
 * have been handled.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = parseArgs(process.argv.slice(2));

  // NOTE(review): loadProviderFor is defined in ../providers/provider.mjs;
  // it is assumed to return an object exposing generate(prompt) — confirm.
  const provider = loadProviderFor('question');

  console.log('🧩 Question generation');
  console.log(` Input chunks: ${args.input}`);
  console.log(` Output seeds: ${args.output}`);
  console.log(` Max questions/chunk: ${args.max}`);
  console.log('');

  await fs.mkdir(path.dirname(args.output), { recursive: true });

  let chunkCount = 0;
  let questionCount = 0;
  const seedLines = [];

  for await (const rec of readJsonl(args.input)) {
    chunkCount += 1;

    // A line such as `null` or `42` is valid JSON but not a record; reading
    // properties off null would throw and abort the whole run.
    const record = rec !== null && typeof rec === 'object' ? rec : {};
    const id = record.id ?? record.doc_id ?? chunkCount;
    const contextText = record.content ?? record.text ?? '';

    // Non-string content (e.g. {"content": 123}) has no .trim(); skip it
    // together with blank chunks instead of crashing.
    if (typeof contextText !== 'string' || !contextText.trim()) continue;

    console.log(`→ [${chunkCount}] generating questions for chunk id=${id}…`);

    const { questions } = await runQuestionGenerator(contextText, provider, {
      maxQuestions: args.max,
    });

    for (const q of questions) {
      questionCount += 1;
      const seed = {
        question: q,
        source_id: id,
        // keep some trace of where it came from
        meta: {
          doc_id: record.doc_id ?? null,
          chunk_index: record.chunk_index ?? null,
        },
      };
      seedLines.push(JSON.stringify(seed));
    }
  }

  // One JSON document per line, each line newline-terminated.
  const payload = seedLines.map((line) => `${line}\n`).join('');
  await fs.writeFile(args.output, payload, 'utf8');

  console.log('');
  console.log('✅ Done');
  console.log(` Chunks processed: ${chunkCount}`);
  console.log(` Questions generated: ${questionCount}`);
  console.log(` Seeds JSONL: ${args.output}`);
}

// Run the CLI; any failure is reported and turned into a non-zero exit code.
main().catch((err) => {
  console.error('❌ Question generation error:', err);
  process.exit(1);
});
|
src/question/question_core.mjs
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// src/question/question_core.mjs
|
| 2 |
+
import fs from 'fs/promises';
|
| 3 |
+
import path from 'path';
|
| 4 |
+
|
| 5 |
+
/**
 * Read the question prompt template, `prompts/question_prompt.txt`, located
 * two directories above the directory of this module.
 *
 * @returns {Promise<string>} the template text
 */
async function loadQuestionTemplate() {
  // Resolve against this module's file: URL and hand the URL straight to
  // fs.readFile. URL#pathname must not be used as a filesystem path: it
  // stays percent-encoded (a space becomes "%20") and carries a leading
  // slash before Windows drive letters ("/C:/..."), so the file would not
  // be found in those cases.
  const templateUrl = new URL('../../prompts/question_prompt.txt', import.meta.url);
  return fs.readFile(templateUrl, 'utf8');
}
|
| 15 |
+
|
| 16 |
+
/**
 * Parse a provider reply as JSON.
 *
 * Tries the reply as-is first; if that fails and the reply contains a
 * Markdown code fence (``` or ```json), the fenced content is tried next.
 *
 * @param {*} raw - reply returned by provider.generate()
 * @returns {*} the parsed value, or { error: 'invalid_json', raw } when
 *   nothing could be parsed
 */
function parseProviderJson(raw) {
  try {
    return JSON.parse(raw);
  } catch {
    // Not bare JSON; fall through to the fenced-block attempt below.
  }

  if (typeof raw === 'string') {
    const fenced = raw.match(/```(?:json)?\s*([\s\S]*?)```/i);
    if (fenced) {
      try {
        return JSON.parse(fenced[1]);
      } catch {
        // Fenced content is not JSON either; report invalid_json below.
      }
    }
  }

  return { error: 'invalid_json', raw };
}

/**
 * Keep only the non-blank string entries of a list, trimmed.
 *
 * Non-string entries are dropped rather than stringified, so values such as
 * null or nested objects never become "null" / "[object Object]" questions.
 *
 * @param {Array<*>} items
 * @returns {string[]}
 */
function toQuestionList(items) {
  return items
    .filter((item) => typeof item === 'string')
    .map((item) => item.trim())
    .filter(Boolean);
}

/**
 * runQuestionGenerator
 *
 * Fills the prompt template with the context chunk, sends it to the
 * provider, and extracts a list of questions from the reply.
 *
 * Accepted reply shapes (as JSON, optionally inside a Markdown code fence):
 *   { "questions": ["...", ...] }, a bare array ["...", ...], or
 *   { "question": "..." }.
 * Anything else, including unparseable text, produces an empty list.
 *
 * @param {string} contextText - text chunk we want questions about
 * @param {object} provider - { generate(prompt) → string }
 * @param {object} [options]
 * @param {number} [options.maxQuestions=5] - how many questions to ask for;
 *   when it is a positive integer the returned list is also capped to it
 * @param {string} [options.template] - prompt template containing the
 *   {{MAX_QUESTIONS}} and {{CONTEXT}} placeholders; when omitted the
 *   template is loaded via loadQuestionTemplate()
 *
 * @returns {Promise<{ questions: string[], raw: string, parsed: any }>}
 *   questions: cleaned question strings,
 *   raw: the provider reply exactly as received,
 *   parsed: the parsed reply, or { error: 'invalid_json', raw }
 * @throws {Error} when provider has no generate() function
 */
export async function runQuestionGenerator(
  contextText,
  provider,
  { maxQuestions = 5, template } = {},
) {
  if (!provider || typeof provider.generate !== 'function') {
    throw new Error('runQuestionGenerator: provider.generate() not found');
  }

  const promptTemplate = template ?? (await loadQuestionTemplate());

  // Function replacers are required here: with a string replacement,
  // sequences like "$&", "$$" or "$1" inside the corpus text would be
  // expanded by String.prototype.replace and corrupt the prompt.
  // {{MAX_QUESTIONS}} is filled first so that a literal "{{MAX_QUESTIONS}}"
  // occurring inside the context is never substituted.
  const prompt = promptTemplate
    .replace('{{MAX_QUESTIONS}}', () => String(maxQuestions))
    .replace('{{CONTEXT}}', () => contextText);

  const raw = await provider.generate(prompt);
  const parsed = parseProviderJson(raw);

  let questions = [];
  if (Array.isArray(parsed?.questions)) {
    questions = toQuestionList(parsed.questions);
  } else if (Array.isArray(parsed)) {
    questions = toQuestionList(parsed);
  } else if (typeof parsed?.question === 'string') {
    questions = toQuestionList([parsed.question]);
  }

  // The model may return more than it was asked for; enforce the limit, but
  // only when it is a usable number (NaN or 0 would otherwise empty the list).
  if (Number.isInteger(maxQuestions) && maxQuestions > 0) {
    questions = questions.slice(0, maxQuestions);
  }

  return { questions, raw, parsed };
}
|
| 66 |
+
|
| 67 |
+
export default {
|
| 68 |
+
runQuestionGenerator,
|
| 69 |
+
};
|
| 70 |
+
|
tests/question_core.test.mjs
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// tests/question_core.test.mjs
|
| 2 |
+
import { describe, it, expect } from "vitest";
|
| 3 |
+
import { runQuestionGenerator } from "../src/question/question_core.mjs";
|
| 4 |
+
|
| 5 |
+
describe("runQuestionGenerator", () => {
  // Builds a stub provider whose generate() always resolves to `reply`,
  // regardless of the prompt it is given.
  const providerReplying = (reply) => ({
    generate: async () => reply,
  });

  it("extracts questions from { questions: [...] } JSON", async () => {
    const reply = JSON.stringify({
      questions: ["What is love?", "How can I serve others?"],
    });

    const result = await runQuestionGenerator(
      "Some context chunk about service and love.",
      providerReplying(reply),
      { maxQuestions: 5 },
    );

    expect(Array.isArray(result.questions)).toBe(true);
    expect(result.questions.length).toBe(2);
    expect(result.questions[0]).toBe("What is love?");
    expect(result.questions[1]).toBe("How can I serve others?");
    expect(typeof result.raw).toBe("string");
    expect(result.parsed).toHaveProperty("questions");
  });

  it("handles array root JSON by treating it as a questions list", async () => {
    const reply = JSON.stringify(["Question A?", "Question B?"]);

    const result = await runQuestionGenerator(
      "Another context chunk.",
      providerReplying(reply),
      { maxQuestions: 3 },
    );

    expect(result.questions).toEqual(["Question A?", "Question B?"]);
  });

  it("returns empty list on invalid JSON", async () => {
    const result = await runQuestionGenerator(
      "Context that triggers bad output.",
      providerReplying("this is not json"),
      { maxQuestions: 3 },
    );

    expect(Array.isArray(result.questions)).toBe(true);
    expect(result.questions.length).toBe(0);
    expect(result.parsed).toHaveProperty("error", "invalid_json");
  });
});
|